In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from glob import glob
import numpy as np
import os,re,string
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_extraction.text import CountVectorizer

import codecs

In [2]:
# Root folder of the dataset; the 'train' and 'test' sub-folders are read from it below.
PATH='aclImdb'
# Class sub-folder names; a name's position in the list is used as its integer label.
names = ['neg','pos']


In [3]:
!ls {PATH}

README
imdb.vocab
imdbEr.txt
test
train


In [4]:
!ls {PATH}/train/pos | head

0_9.txt
10000_8.txt
10001_10.txt
10002_7.txt
10003_8.txt
10004_8.txt
10005_7.txt
10006_7.txt
10007_7.txt
10008_7.txt


ls: write error


In [7]:
def load_texts_labels_from_folders(path, folders):
    """Read every file under ``path/<folder>`` and label it by its folder.

    Args:
        path: Directory that contains one sub-directory per class.
        folders: Sub-directory names; the position of a name in this
            sequence becomes the integer label of the files inside it.

    Returns:
        A ``(texts, labels)`` pair: ``texts`` is a list with the decoded
        content of each file and ``labels`` is a NumPy ``int8`` array of the
        same length holding the index of the folder each text came from.
    """
    texts, labels = [], []
    for idx, label in enumerate(folders):
        # The '*.*' pattern only matches file names that contain a dot.
        for fname in glob(os.path.join(path, label, '*.*')):
            # 'utf_8_sig' decodes UTF-8 and drops a leading BOM if present.
            # The context manager closes each handle immediately instead of
            # leaving it open until the garbage collector gets to it.
            with codecs.open(fname, 'r', 'utf_8_sig') as f:
                texts.append(f.read())
            labels.append(idx)
    # Stored as np.int8 to save space (one byte per label).
    return texts, np.array(labels).astype(np.int8)

# Load the train and test splits; labels follow the order of `names`.
text_train,y_train = load_texts_labels_from_folders(f'{PATH}/train',names)
text_test,y_test = load_texts_labels_from_folders(f'{PATH}/test',names)


In [8]:
# Sanity check: texts and labels must have matching lengths in each split
tuple(len(part) for part in (text_train, y_train, text_test, y_test))

(25000, 25000, 25000, 25000)

In [9]:
# Show the first training review, a blank line, then its label
print(text_train[0], '', f"Review's label: {y_train[0]}", sep='\n')

Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.

Review's label: 0


In [10]:
# Character class of everything treated as a stand-alone token: ASCII
# punctuation plus a handful of typographic symbols. re.escape() is needed
# because string.punctuation contains '\', ']', '^' and '-'; left unescaped,
# the '\]' pair is parsed as an escaped bracket and the backslash itself is
# never matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` into tokens, isolating every punctuation character.

    Each character matched by ``re_tok`` is padded with spaces and the result
    is split on whitespace, so punctuation marks become tokens of their own.

    Args:
        s: Text to tokenize.

    Returns:
        List of token strings (empty for an empty or whitespace-only input).
    """
    return re_tok.sub(r' \1 ', s).split()


# Create the term-document matrix: first learn the vocabulary from the
# training texts, splitting them with the custom `tokenize` function.
cv = CountVectorizer(tokenizer=tokenize)
cv.fit(text_train)

# Number of distinct terms in the learned vocabulary
len(cv.vocabulary_)

75132

In [11]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when the installed version provides it and fall
# back to the old method otherwise.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# Peek at 50 consecutive vocabulary entries; str() keeps the printed list
# plain whether the names come back as a list or as a NumPy array.
print([str(name) for name in feature_names[50000:50050]])

['pigeonhole', 'pigeonholed', 'pigeons', 'pigface', 'pigging', 'piggish', 'piggly', 'piggy', 'piggys', 'piglet', 'piglets', 'pigmalionize', 'pigozzi', 'pigs', 'pigsty', 'pigtailed', 'pigtails', 'pikachu', 'pike', 'pikes', 'pikser', 'pilar', 'pilate', 'pilcher', 'pile', 'piled', 'piledriver', 'pileggi', 'piles', 'pilfered', 'pilfering', 'pilfers', 'pilger', 'pilgrim', 'pilgrimage', 'pilgrims', 'piling', 'pilippinos', 'pill', 'pillage', 'pillaged', 'pillaging', 'pillar', 'pillars', 'pilliar', 'pilling', 'pillman', 'pilloried', 'pillory', 'pillow']


In [12]:
# Encode the training reviews as a sparse matrix of token counts
X_train = cv.transform(text_train)
X_train

<25000x75132 sparse matrix of type '<class 'numpy.int64'>'
	with 3749745 stored elements in Compressed Sparse Row format>

In [13]:
# Spot-check a single training review picked by index
print(text_train[19726])

Enjoyable and watchable. Tim Meadows at his best. A big boost from Billy Dee Williams. He and a very funny John Witherspoon provide a solid foundation for Mr. Meadows' riffing. Have fun with this one.


In [14]:
# Encode the test reviews with the vocabulary learned from the training set
X_test = cv.transform(text_test)

In [15]:
%%time
# Baseline: logistic regression on the token-count matrix, with a fixed seed
logit = LogisticRegression(n_jobs=-1, random_state=7)
logit.fit(X_train, y_train)

Wall time: 17.3 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2', random_state=7,
                   solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [16]:
%%time
# Baseline: linear SVM on the same token-count matrix, with a fixed seed
svm = LinearSVC(random_state=7)
svm.fit(X_train, y_train)

Wall time: 9.44 s


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=7, tol=0.0001,
          verbose=0)

In [17]:
# Scores of both baselines on the training data, rounded to three decimals
tuple(round(model.score(X_train, y_train), 3) for model in (logit, svm))

(0.999, 1.0)

In [18]:
# Scores of both baselines on the test data, rounded to three decimals
tuple(round(model.score(X_test, y_test), 3) for model in (logit, svm))

(0.87, 0.848)

In [19]:
%%time
from sklearn.pipeline import make_pipeline

# Vectorizer + classifier chained into one estimator that is fitted on raw text.
# Note: this CountVectorizer is created with default settings, i.e. without the
# custom `tokenize` function used for `cv`.
text_pipe_logit = make_pipeline(CountVectorizer(), 
                                LogisticRegression(n_jobs=-1, random_state=7))

text_pipe_logit.fit(text_train, y_train)
# Score of the fitted pipeline on the test reviews
print(text_pipe_logit.score(text_test, y_test))

0.86676
Wall time: 25.8 s


In [20]:
%%time
from sklearn.model_selection import GridSearchCV

# Candidate values of C: six points, log-spaced from 1e-5 to 1
param_grid_logit = {'logisticregression__C': 
                    np.logspace(-5, 0, 6)}
# 3-fold cross-validated search over C; n_jobs=-1 runs the fits in parallel
grid_logit = GridSearchCV(text_pipe_logit, param_grid_logit, cv=3, n_jobs=-1)

grid_logit.fit(text_train, y_train)

Wall time: 1min 31s


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('countvectorizer',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                 

In [21]:
# Best C found by the search and the cross-validated score it achieved
grid_logit.best_params_, grid_logit.best_score_

({'logisticregression__C': 0.01}, 0.86184)

In [22]:
%%time
# Same pipeline layout as for logistic regression, with a linear SVM as the classifier
text_pipe_svm = make_pipeline(CountVectorizer(), LinearSVC(random_state=7))

text_pipe_svm.fit(text_train, y_train)
# Score of the fitted pipeline on the test reviews
print(text_pipe_svm.score(text_test, y_test))

0.84568
Wall time: 21.8 s


In [23]:
%%time
# Candidate values of C: six points, log-spaced from 1e-5 to 1
param_grid_svm = {'linearsvc__C': np.logspace(-5, 0, 6)}
# 3-fold cross-validated search over C; n_jobs=-1 runs the fits in parallel
grid_svm = GridSearchCV(text_pipe_svm, param_grid_svm, cv=3, n_jobs=-1)

grid_svm.fit(text_train, y_train);

Wall time: 1min 31s


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('countvectorizer',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                 

In [24]:
# Best C found by the search and the cross-validated score it achieved
grid_svm.best_params_, grid_svm.best_score_

({'linearsvc__C': 0.001}, 0.86248)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features instead of raw counts. The classifier is seeded like the
# other LinearSVC models in this notebook so that reruns are reproducible.
tfidf_pipe = make_pipeline(TfidfVectorizer(), LinearSVC(random_state=7))

# Candidate values of C: six points, log-spaced from 1e-3 to 1e2
param_grid = {'linearsvc__C': np.logspace(-3, 2, 6)}
# 3-fold cross-validated search over C; n_jobs=-1 runs the fits in parallel
grid_tfidf = GridSearchCV(tfidf_pipe, param_grid, cv=3, n_jobs=-1)
grid_tfidf.fit(text_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tfidfvectorizer',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                               

In [26]:
# Cross-validated score of the best TF-IDF + LinearSVC setting and the C that produced it
grid_tfidf.best_score_, grid_tfidf.best_params_

(0.85872, {'linearsvc__C': 0.1})

***Вывод: получил наилучший score на кросс-валидации = 0.86248 методом LinearSVC + GridSearchCV (C = 0.001).***