In [3]:
import pandas as pd
from sklearn.utils import shuffle

import re, nltk
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [90]:
# Crawler-enriched corpus; only a thinned slice of its label-0 rows is used below.
df = pd.read_csv("enriched_data_crawler.csv")
print(df.label.value_counts())

# Positions 1, 4, 7, ... within the label-0 rows (every third row, offset 1).
# Kept as a list named `arr` because a later cell indexes with it.
# NOTE(review): presumably this keeps one augmented variant per original
# quote -- confirm against the layout of the crawler file.
arr = list(range(1, len(df.loc[df.label == 0]), 3))

df_2 = pd.read_csv("../data/movie_reviews.csv")

# Movie reviews plus at most 26000 of the thinned label-0 crawler rows.
df = pd.concat([df_2, df.loc[df.label == 0].iloc[arr].iloc[:26000]])

# Rows without text cannot be tokenised, so drop them before training.
df = df[~df.text.isnull()]

# Seeded shuffle: without a fixed random_state the row order (and therefore
# the train/test split and every reported metric) changes on each run.
df = shuffle(df, random_state=42)

1    268974
0    188856
Name: label, dtype: int64


In [91]:
# Class balance of the combined training frame.
df["label"].value_counts()

1    89658
0    88768
Name: label, dtype: int64

In [4]:
# Built once at definition time: the vectorizer calls tokenize() for every
# document, so constructing the stemmer and resolving the pattern inside the
# function repeated loop-invariant work on each call.
_NON_LETTERS = re.compile("[^a-zA-Z]")
_STEMMER = SnowballStemmer("english")


def tokenize(text):
    """Split ``text`` into Snowball-stemmed English word tokens.

    Every character outside ``a-z`` / ``A-Z`` is replaced by a space before
    tokenising, so digits and punctuation never reach the tokenizer.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        One stem per token, in the order the tokens appear.
    """
    letters_only = _NON_LETTERS.sub(" ", text)
    return [_STEMMER.stem(word) for word in nltk.word_tokenize(letters_only)]

In [93]:
# Stratified 80/20 split so both classes keep their proportions on each side.
X_train, X_test, y_train, y_test = train_test_split(
    df.text, df.label, test_size=0.2, random_state=42, stratify=df.label
)

# Binary TF-IDF over uni- to tri-grams produced by tokenize(), ignoring terms
# present in more than 75% of the documents ...
tfidf_step = (
    'vectorizer',
    TfidfVectorizer(
        tokenizer=tokenize,
        analyzer='word',
        ngram_range=(1, 3),
        binary=True,
        max_df=0.75,
    ),
)
# ... feeding a class-balanced logistic regression.
logreg_step = ('classifier', LogisticRegression(C=100, class_weight='balanced'))

pipeline = Pipeline([tfidf_step, logreg_step])
model = pipeline.fit(X_train, y_train)

In [94]:
# Evaluate the fitted pipeline on the held-out 20% split.
y_pred = model.predict(X_test)

# Accuracy on the first line, per-class report underneath.
print(
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

0.866810513927
             precision    recall  f1-score   support

          0       0.89      0.84      0.86     17754
          1       0.85      0.90      0.87     17932

avg / total       0.87      0.87      0.87     35686



In [5]:
import os
import numpy as np
# Location of the external polarity test set, relative to the notebook.
BASE_DIR = ""
TEXT_DATA_DIR = BASE_DIR + "../data/test"

# One sentence per line; the file name carries the polarity (neg / pos).
TEXT_DATA_FILE_1 = 'rt-polarity_neg.txt'
TEXT_DATA_FILE_2 = 'rt-polarity_pos.txt'

# When True, load_data() discards the first line of each file.
HEADER = True

def load_data(data_dir=None, file_names=None, header=None):
    """Read one-sentence-per-line polarity files into parallel lists.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding the files. Defaults to ``TEXT_DATA_DIR``.
    file_names : iterable of str, optional
        Files to read, in order. Defaults to
        ``[TEXT_DATA_FILE_1, TEXT_DATA_FILE_2]``.
    header : bool, optional
        Whether to discard the first line of every file. Defaults to
        ``HEADER``.

    Returns
    -------
    (list of str, list of int)
        The lines (trailing newline removed) and one label per line:
        1 when the file name without its extension ends in ``"pos"``,
        otherwise 0.
    """
    # Resolve defaults at call time so the module-level configuration is
    # only needed when the caller does not supply a value.
    if data_dir is None:
        data_dir = TEXT_DATA_DIR
    if file_names is None:
        file_names = [TEXT_DATA_FILE_1, TEXT_DATA_FILE_2]
    if header is None:
        header = HEADER

    x = []
    y = []
    for file_name in file_names:
        # Label comes from the stem rather than fixed character offsets, so
        # it does not depend on the extension being exactly three letters.
        stem = os.path.splitext(file_name)[0]
        temp_y = 1 if stem.endswith("pos") else 0

        path = os.path.join(data_dir, file_name)
        # errors='ignore' silently drops bytes that are not valid UTF-8.
        with open(path, "r", encoding='utf-8', errors='ignore') as f:
            if header:
                # A default avoids StopIteration escaping on an empty file.
                next(f, None)
            for line in f:
                x.append(line.rstrip("\n"))
                y.append(temp_y)

    return x, y

# Read both polarity files, then hold the 0/1 labels as a compact int8 array.
data, label_list = load_data()
labels = np.asarray(label_list, dtype="int8")

In [6]:
# Persist the external test set as a two-column CSV (text, label), without the index.
temp = pd.DataFrame(dict(text=data, label=labels))
temp.to_csv("test.csv", index=False)

In [7]:
# Round-trip check: read back the CSV written by the previous cell and
# display its (rows, columns) shape.
temp = pd.read_csv("test.csv")
temp.shape

(10660, 2)

In [96]:
# Score the fitted pipeline on the sentences returned by load_data().
y_pred = model.predict(data)

accuracy = accuracy_score(labels, y_pred)
report = classification_report(labels, y_pred)
print(accuracy)
print(report)

0.829174484053
             precision    recall  f1-score   support

          0       0.89      0.75      0.81      5330
          1       0.78      0.91      0.84      5330

avg / total       0.84      0.83      0.83     10660



In [51]:
# Same evaluation code as the In [96] cell. Its execution count (In [51])
# precedes the In [93] fit, so the score shown below presumably belongs to an
# earlier `model` -- re-run or remove this cell to avoid a stale result.
y_pred = model.predict(data)

print (accuracy_score(labels, y_pred))
print(classification_report(labels, y_pred))

0.84296435272
             precision    recall  f1-score   support

          0       0.88      0.79      0.83      5330
          1       0.81      0.89      0.85      5330

avg / total       0.85      0.84      0.84     10660



In [8]:
import os

# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; prefer the standalone package and fall back only on very old
# environments that still bundle it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# The pickle location can be overridden with the TRAINED_MODEL_PATH
# environment variable; the original absolute path stays as the default so
# existing setups keep working.
TRAINED_MODEL_PATH = os.environ.get(
    "TRAINED_MODEL_PATH",
    '/Users/vitaliyradchenko/Documents/udsc/check3/xray/preprocess1.pkl',
)

# NOTE(security): joblib.load unpickles arbitrary Python objects -- only load
# files from a trusted source.
# NOTE(review): if the pipeline was pickled with the custom `tokenize`
# function, that function must already be defined in this kernel -- confirm.
trained_model = joblib.load(TRAINED_MODEL_PATH)

In [10]:
# Display the loaded pipeline's repr to inspect its steps and hyper-parameters.
trained_model

Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=Tr...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [13]:
# Number of sentences in the external test set held in `data`.
len(data)

10660

In [14]:
# Score the pipeline loaded from disk on the external test sentences.
y_pred = trained_model.predict(data)

# Accuracy on the first line, per-class report underneath.
print(
    accuracy_score(labels, y_pred),
    classification_report(labels, y_pred),
    sep="\n",
)

0.836303939962
             precision    recall  f1-score   support

          0       0.89      0.77      0.82      5330
          1       0.80      0.90      0.85      5330

avg / total       0.84      0.84      0.84     10660



In [62]:
# One third of the row count (true division, so the result is a float).
len(df) / 3

152610.0

In [15]:
# Second position visited when walking df's rows with a step of 3.
stride_positions = range(0, len(df), 3)
stride_positions[1]

3

In [61]:
# Rows per class in df.
df["label"].value_counts()

1    268974
0    188856
Name: label, dtype: int64

In [63]:
# Class gap (label-0 count minus label-1 count) among every third row.
# Positional slicing replaces label-based .loc with a range, which is only
# correct on a default RangeIndex and breaks once df has been concatenated or
# shuffled. The counts are also computed once instead of twice.
every_third_counts = df["label"].iloc[::3].value_counts()
every_third_counts.get(0, 0) - every_third_counts.get(1, 0)

-26706

In [64]:
# Peek at ten consecutive label-0 texts (positions 40-49 within that subset).
df.loc[df["label"] == 0, "text"].iloc[40:50].values

array([ ' do ai rap television do cute however dim furthermore breathtaking pistol include do improved away',
       ' breathe ah soul broadcast breathe beautiful although tedious also affecting doer enjoy breathe superior somewhere',
       '"For all the pleasure there is in seeing effective, great-looking black women grappling with major life issues on screen, Waiting to Exhale is an uneven piece."',
       ' entire breathtaking enjoyment effective do swank noticing impressive slate mate confront alongside large-scale growth point above net into do an unequal specimen',
       ' complete affecting thrill skilled breathe fly regarding useful jet mariner contend along dominant heart affair about curtain facing breathe an patchy slice',
       '"With one possible exception, none of its women is at all likable."',
       ' sole probable rejection nobody concerning mine mate do on entire enjoyable',
       ' solitary available omission nothing about owned mariner breathe by complete sympa

In [75]:
# Shape of the label-0 subset after keeping only the positions listed in `arr`.
df[df["label"] == 0].iloc[arr].shape

(62952, 2)