In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.linear_model import PassiveAggressiveClassifier

In [2]:
def import_data(path, sep="|"):
    return pd.read_csv(path, sep)

def save_trained_model(model, path):
    joblib.dump(model, path)


def __drop_duplicates(df):
    return df.drop_duplicates()


def __balance_data(df):
    df_positive = df[df.label == 1]
    df_negative = df[df.label == 0]
    pos_vs_neg = len(df_positive) - len(df_negative)
    if pos_vs_neg > 0:
        drop_indices = np.random.choice(df_positive.index, pos_vs_neg, replace=False)
        df_positive = df_positive.drop (drop_indices)
    elif pos_vs_neg < 0:
        pos_vs_neg *= -1
        drop_indices = np.random.choice(df_negative.index, pos_vs_neg, replace=False)
        df_negative = df_negative.drop (drop_indices)
    else:
         return df   
    return pd.concat([df_positive, df_negative])


def __perform_stemming(review):
    stemmer = PorterStemmer()
    return "".join([stemmer.stem(word) for word in review.split()])


def __drop_non_english(df):
    drop_indices= []
    for index, row in df.iterrows():
        try:
            if detect(row['text']) != "en":
                drop_indices.append (index)
        except Exception:
             drop_indices.append (index)
    return df.drop(drop_indices)

    
def __is_numerical(token):
    return str.isnumeric(token)


def preprocessing(df, balance_data=False, drop_non_english=False, drop_duplicates=False, stemming=False, 
                  replace_numerical=False, replacement_of_numerical="NUMERICAL_TOKEN", lowercase=False):
    if balance_data:
        df = __balance_data(df)
    if drop_duplicates:
        df = __drop_duplicates(df)
    if drop_non_english:
        df = __drop_non_english(df)
    def tbt_cleaning(review):
        cleaned_tokens = []
        stemmer = None
        if stemming:
            stemmer = PorterStemmer()
        tokens = nltk.word_tokenize(review)
        for token in tokens:
            if stemming:
                token = stemmer.stem(token)
            if replace_numerical and __is_numerical(token):
                token = replacement_of_numerical
            if lowercase:
                token = token.lower()
            cleaned_tokens.append(token)
        return ' '.join(cleaned_tokens)
    if stemming or replace_numerical or lowercase:
        return df.text.apply(tbt_cleaning)
    return df

In [84]:
dr = import_data('reviews_rt_all.csv')
di = import_data('imdb_small.csv')
frames = [dr, di]
df = pd.concat(frames)
len(df)

152610

In [85]:
df['last_22'] = df.text.str.split().apply(lambda x:  ' '.join(x for x in x[-22:]))
df['text'] = df['last_22']

In [100]:
df = preprocessing(df)

In [101]:
y = df.label
X_train, X_test, y_train, y_test = train_test_split(df.text, y, test_size=0.2, random_state=42, stratify=y)

In [102]:
STOPWORDS = ['the','of', 'is', 'im', 'that', 'it', 'this', 'for', 'with',  'film','you','movie','on', 'was', 'an', 'have',
           'are', 'one', 'at', 'its', 'his', 'from', 'all', 'like', 'more']
#the, of, a, and, to, is, in, that, as, it


In [103]:
# Now lets try putting this into a Count Vectorizer with default params.
vectorizer = CountVectorizer(binary=True, ngram_range=(1,3), stop_words=STOPWORDS)
X_train = vectorizer.fit_transform(X_train)

In [104]:
# Create model
classifier = PassiveAggressiveClassifier(C=0.001, fit_intercept = False, shuffle = False, n_iter = 91, n_jobs = -1)
classifier.fit(X_train, y_train)

PassiveAggressiveClassifier(C=0.001, class_weight=None, fit_intercept=False,
              loss='hinge', n_iter=91, n_jobs=-1, random_state=None,
              shuffle=False, verbose=0, warm_start=False)

In [105]:
# Create prediction label
X_test = vectorizer.transform(X_test)
predictions = classifier.predict(X_test)

In [106]:
# Calculate accuracy and score
print ("Accuracy:", metrics.accuracy_score(y_test, predictions))

Accuracy: 0.798145599895


In [277]:
# Save model and vectorizer
save_trained_model(classifier, 'model_PAC_RT_IMDB.pkl')
save_trained_model(vectorizer, 'vectorizer_PAC_RT_IMDB.pkl')

In [281]:
MODEL_PATH = "model_PAC1.pkl"
VECTORIZER_PATH = "vectorizer_PAC1.pkl"
df = pd.concat([X_test, y_test ], axis=1 )
X = preprocessing(X_test, VECTORIZER_PATH)
model = load_model(MODEL_PATH)
y_pred = model.predict(X)
display_accuracy(df.label, y_pred)

TypeError: cannot concatenate a non-NDFrame object

In [48]:
from itertools import chain
from collections import Counter
rows = Counter(chain.from_iterable(df.text.map(lambda x: str(x).split(" ")))).most_common(40)
STOPWORDS =[x[0] for x in rows]

In [50]:
for word in STOPWORDS:
    words = []
    words.append(word)
    df = preprocessing(df)
    y = df.label
    X_train, X_test, y_train, y_test = train_test_split(df.text, y, test_size=0.2, random_state=42, stratify=y)
    vectorizer = CountVectorizer(binary=True, ngram_range=(1,3), stop_words=words)
    X_train = vectorizer.fit_transform(X_train)
    classifier = PassiveAggressiveClassifier(C=0.001, fit_intercept = False, shuffle = False, n_iter = 91, n_jobs = -1)
    classifier.fit(X_train, y_train)
    X_test = vectorizer.transform(X_test)
    predictions = classifier.predict(X_test)
    Accuracy = metrics.accuracy_score(y_test, predictions)
    check=False
    if Accuracy>=0.813904724461:
        check=True
    print ("Accuracy:", Accuracy, words, check)

Accuracy: 0.815411834087 ['the'] True
Accuracy: 0.813970250967 ['a'] True
Accuracy: 0.812495904593 ['and'] False
Accuracy: 0.814822095538 ['of'] True
Accuracy: 0.81269248411 ['to'] False
Accuracy: 0.814265120241 ['is'] True
Accuracy: 0.814003014219 ['in'] True
Accuracy: 0.814953148549 ['that'] True
Accuracy: 0.815116964812 ['it'] True
Accuracy: 0.81449446301 ['this'] True
Accuracy: 0.81413406723 ['for'] True
Accuracy: 0.813937487714 ['as'] True
Accuracy: 0.813970250967 ['I'] True
Accuracy: 0.813806434703 ['but'] False
Accuracy: 0.81521525457 ['with'] True
Accuracy: 0.813970250967 ['The'] True
Accuracy: 0.813970250967 ['/><br'] True
Accuracy: 0.815018675054 ['film'] True
Accuracy: 0.815346307581 ['you'] True
Accuracy: 0.815248017823 ['movie'] True
Accuracy: 0.814428936505 ['on'] True
Accuracy: 0.813937487714 ['be'] True
Accuracy: 0.811840639539 ['not'] False
Accuracy: 0.814625516021 ['was'] True
Accuracy: 0.814428936505 ['an'] True
Accuracy: 0.814199593736 ['have'] True
Accuracy: 0.8141

In [68]:
Accuracy: 0.813904724461 -- без всего

SyntaxError: invalid syntax (<ipython-input-68-f025135c4141>, line 1)

In [None]:
Accuracy: 0.80024244807