In [1]:
import os
import pandas as pd
import numpy as np
# Load every file under ./{test,train}/{neg,pos}/ into one DataFrame with
# two columns: 'label' (the name of the neg/pos directory the file came
# from) and 'detail' (the full text content of the file).
#
# Rows are collected in a plain list and the frame is built once at the end.
# Calling DataFrame.append inside the loop copied the whole frame for every
# file (quadratic in the number of files), and the method no longer exists
# in pandas >= 2.0.
records = []
for a in ['test', 'train']:
    for b in ['neg', 'pos']:
        folder = os.path.join('.', a, b)
        files = os.listdir(folder)
        for f in files:
            with open(os.path.join(folder, f), "r") as myfile:
                records.append({'label': b, 'detail': myfile.read()})

# Explicit columns keep the frame well-formed even if no files were found.
df = pd.DataFrame(records, columns=['label', 'detail'])

In [2]:
# Randomly partition `df` into two disjoint subsets and report, for each,
# its shape and its (pos, neg) label counts.
#
# A dedicated, seeded generator is used so that the split -- and therefore
# every score computed from it further down -- is identical on every
# Restart & Run All. The global NumPy random state is left untouched.
SPLIT_SEED = 42
seperate = np.random.RandomState(SPLIT_SEED).rand(len(df)) >= 0.3

# NOTE(review): the mask is True for roughly 70% of the rows, so `test` is
# the LARGER subset (~70%) and `train` the smaller one (~30%). The recorded
# outputs were produced with this proportion, so it is kept as is; confirm
# that this direction is intended.
test = df[seperate]
train = df[~seperate]

print(train.shape)
print((train['label'] == 'pos').sum(), (train['label'] == 'neg').sum())
print(test.shape)
print((test['label'] == 'pos').sum(), (test['label'] == 'neg').sum())

(15095, 2)
7495 7600
(34905, 2)
17505 17400


In [4]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Multinomial Naive Bayes with default-configured vectorizers: first on
# CountVectorizer features, then on TfidfVectorizer features. Each pipeline
# is fitted on `train` and its classification report on `test` is printed.
# After the loop `text_clf` holds the last (TF-IDF) pipeline.
for step_name, vectorizer in (('vect', CountVectorizer()),
                              ('tfidf', TfidfVectorizer())):
    text_clf = Pipeline([(step_name, vectorizer), ('clf', MultinomialNB())])
    text_clf.fit(train['detail'], train['label'])
    predicted = text_clf.predict(test['detail'])
    print(metrics.classification_report(test['label'], predicted))

             precision    recall  f1-score   support

        neg       0.82      0.88      0.85     17400
        pos       0.87      0.80      0.83     17505

avg / total       0.84      0.84      0.84     34905

             precision    recall  f1-score   support

        neg       0.82      0.91      0.86     17400
        pos       0.90      0.80      0.84     17505

avg / total       0.86      0.85      0.85     34905



In [5]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default-configured vectorizers: first on
# CountVectorizer features, then on TfidfVectorizer features. Each pipeline
# is fitted on `train` and its classification report on `test` is printed.
# After the loop `text_clf` holds the last (TF-IDF) pipeline.
for step_name, vectorizer in (('vect', CountVectorizer()),
                              ('tfidf', TfidfVectorizer())):
    text_clf = Pipeline([(step_name, vectorizer), ('clf', LogisticRegression())])
    text_clf.fit(train['detail'], train['label'])
    predicted = text_clf.predict(test['detail'])
    print(metrics.classification_report(test['label'], predicted))

             precision    recall  f1-score   support

        neg       0.88      0.87      0.88     17400
        pos       0.87      0.88      0.88     17505

avg / total       0.88      0.88      0.88     34905

             precision    recall  f1-score   support

        neg       0.89      0.87      0.88     17400
        pos       0.88      0.89      0.88     17505

avg / total       0.88      0.88      0.88     34905



In [6]:
# Same comparison as the earlier cells, but with both vectorizers restricted
# to ngram_range=(2, 2). Reports are printed in this order:
#   MultinomialNB + counts, MultinomialNB + tf-idf,
#   LogisticRegression + counts, LogisticRegression + tf-idf.
# After the loops `text_clf` holds the last pipeline (tf-idf + LogisticRegression).
NGRAMS = (2, 2)
for classifier_cls in (MultinomialNB, LogisticRegression):
    for step_name, vectorizer_cls in (('vect', CountVectorizer),
                                      ('tfidf', TfidfVectorizer)):
        text_clf = Pipeline([
            (step_name, vectorizer_cls(ngram_range=NGRAMS)),
            ('clf', classifier_cls()),
        ])
        text_clf.fit(train['detail'], train['label'])
        predicted = text_clf.predict(test['detail'])
        print(metrics.classification_report(test['label'], predicted))

             precision    recall  f1-score   support

        neg       0.86      0.90      0.88     17400
        pos       0.90      0.85      0.88     17505

avg / total       0.88      0.88      0.88     34905

             precision    recall  f1-score   support

        neg       0.85      0.92      0.88     17400
        pos       0.92      0.83      0.87     17505

avg / total       0.88      0.88      0.88     34905

             precision    recall  f1-score   support

        neg       0.88      0.86      0.87     17400
        pos       0.87      0.88      0.87     17505

avg / total       0.87      0.87      0.87     34905

             precision    recall  f1-score   support

        neg       0.87      0.86      0.87     17400
        pos       0.86      0.88      0.87     17505

avg / total       0.87      0.87      0.87     34905



In [11]:
from fastText import train_supervised 
def _write_fasttext_file(frame, path):
    """Write `frame` to `path`, one "__label__<label> <text>" line per row.

    Reads the 'label' and 'detail' columns of `frame`. Any existing file at
    `path` is replaced.
    """
    # Mode 'w', not 'a': with append mode every re-run of this cell added the
    # same examples again, and after a re-split the files would contain rows
    # from older splits (training rows leaking into the test file and back).
    with open(path, 'w') as the_file:
        for _, row in frame.iterrows():
            # Keep each example on a single physical line: a newline inside
            # the text would otherwise start a new line that carries no
            # label prefix.
            text = str(row['detail']).replace('\n', ' ')
            the_file.write("__label__{0} {1}\n".format(row['label'], text))


# Train a fastText supervised model on the training subset, then evaluate it
# on the test subset using files written in the "__label__" line format.
_write_fasttext_file(train, 'train.txt')
model = train_supervised(input="train.txt")

_write_fasttext_file(test, 'test.txt')
# Evaluate once and reuse the result (the original ran the evaluation twice).
# Elements 1 and 2 of the result are what this cell reports as precision
# and recall.
result = model.test('test.txt')
print('precision:', result[1], 'recall:', result[2])

precision: 0.8755479157713795 recall: 0.8755479157713795


In [12]:
from sklearn.externals import joblib
# Fit the TF-IDF + logistic-regression pipeline on `train` and persist the
# fitted pipeline to 'model.pkl'. The dump call stays the last expression
# of the cell so its return value is shown as the cell output.
final_steps = [('tfidf', TfidfVectorizer()), ('clf', LogisticRegression())]
text_clf = Pipeline(final_steps)
text_clf.fit(train['detail'], train['label'])
joblib.dump(text_clf, 'model.pkl')

['model.pkl']