In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import ElasticNet
from sklearn.cross_validation import cross_val_score 
from sklearn.linear_model import LogisticRegression 
from sklearn.linear_model import LogisticRegressionCV          
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel
import operator



In [2]:
# Input files: three monthly dumps for training, one held-out dump for testing.
train = ['/data/reddit1.csv',
         '/data/reddit2.csv',
         '/data/reddit3.csv']

test =  ['/data/reddit9.csv']

# Fraction of non-controversial comments kept per training file, and the seed
# that makes that down-sampling reproducible across kernel restarts.
NEGATIVE_SAMPLE_FRAC = 0.03
SAMPLE_SEED = 42


def _load_comments(paths, negative_frac=None, random_state=None):
    """Read comment CSVs and stack them into a single frame.

    For every file, the rows with ``controversiality == 1`` come first,
    followed by the rows with ``controversiality == 0``; files are stacked in
    the order given and the result gets a fresh 0..n-1 index.

    Parameters
    ----------
    paths : iterable of str
        CSV files containing at least the columns ``body``, ``created_utc``
        and ``controversiality``.
    negative_frac : float or None
        If given, only this fraction of the ``controversiality == 0`` rows of
        each file is kept (sampled without replacement). ``None`` keeps all.
    random_state : int or None
        Seed forwarded to ``DataFrame.sample`` when down-sampling.

    Returns
    -------
    pandas.DataFrame
        Columns ``body``, ``created_utc``, ``controversiality``. Empty (but
        with those columns) when ``paths`` is empty.
    """
    columns = ['body', 'created_utc', 'controversiality']
    parts = []
    for path in paths:
        comments = pd.read_csv(path, usecols=columns)[columns]
        # adding the controversial
        parts.append(comments[comments['controversiality'] == 1])
        # adding the non-controversial
        negatives = comments[comments['controversiality'] == 0]
        if negative_frac is not None:
            negatives = negatives.sample(frac=negative_frac, replace=False,
                                         random_state=random_state)
        parts.append(negatives)
    if not parts:
        return pd.DataFrame(columns=columns)
    # One concat instead of repeated DataFrame.append: append copied the whole
    # accumulated frame on every call and no longer exists in pandas >= 2.0.
    return pd.concat(parts, ignore_index=True)


# Training data: every controversial comment plus a 3% sample of the rest.
finalDf = _load_comments(train,
                         negative_frac=NEGATIVE_SAMPLE_FRAC,
                         random_state=SAMPLE_SEED)
# Test data: every comment labelled 0 or 1, no down-sampling.
finalDf2 = _load_comments(test)

# Modelling frames: drop the timestamp and force the text column to str.
# NOTE(review): astype(str) presumably guards against non-string bodies
# (e.g. missing values) reaching the vectorizer -- confirm against the data.
df_train = finalDf.drop('created_utc', axis=1)
df_train['body'] = df_train['body'].astype(str)

df_test = finalDf2.drop('created_utc', axis=1)
df_test['body'] = df_test['body'].astype(str)

In [4]:
# Target vectors: the 0/1 controversiality flag of each split.
y_train = df_train['controversiality']
y_test = df_test['controversiality']

In [43]:
def extractFeature(vectorizer, selector, x_train, y_train, test):
    """Vectorize the two text collections and keep only the selected features.

    Both transformers are fitted on the training data only and then applied
    to the test data.

    Parameters
    ----------
    vectorizer : object exposing ``fit_transform(docs)`` and ``transform(docs)``
    selector : object exposing ``fit_transform(X, y)`` and ``transform(X)``
    x_train : training documents passed to the vectorizer
    y_train : training labels passed to the selector
    test : test documents passed to the vectorizer

    Returns
    -------
    tuple
        ``(selected_train, selected_test, vectorizer, selector)`` where the
        last two are the same objects that were passed in, now fitted.
    """
    # Step 1: learn the vocabulary on the training documents, reuse it on test.
    train_matrix = vectorizer.fit_transform(x_train)
    test_matrix = vectorizer.transform(test)

    # Step 2: learn which columns to keep from the training labels, then
    # apply the same column mask to the test matrix.
    selected_train = selector.fit_transform(train_matrix, y_train)
    selected_test = selector.transform(test_matrix)

    return selected_train, selected_test, vectorizer, selector

In [44]:
def logisticModel(x_train, y_train, x_test, y_test, cv=5, max_iter=2000, n_jobs=-1):
    """Fit a ``LogisticRegressionCV`` and score it on the train and test data.

    Parameters
    ----------
    x_train, y_train : training feature matrix and labels used for fitting
    x_test, y_test : held-out feature matrix and labels used only for scoring
    cv : int, default 5
        Number of cross-validation folds passed to ``LogisticRegressionCV``.
    max_iter : int, default 2000
        Iteration cap passed to ``LogisticRegressionCV``.
    n_jobs : int, default -1
        Parallelism passed to ``LogisticRegressionCV``.

    Returns
    -------
    tuple
        ``(train_score, test_score, model)`` -- ``model.score`` on the
        training data, ``model.score`` on the held-out data, and the fitted
        estimator.
    """
    model = LogisticRegressionCV(cv=cv, max_iter=max_iter, n_jobs=n_jobs)
    model.fit(x_train, y_train)
    # The second value is the score on (x_test, y_test), not a
    # cross-validation score; the folds are only used internally by the fit.
    train_score = model.score(x_train, y_train)
    test_score = model.score(x_test, y_test)
    return train_score, test_score, model

In [45]:
# Turn the comment bodies into TF-IDF features over unigrams and bigrams,
# then keep the 50,000 columns with the highest chi-squared score against the
# training labels. Both transformers are fitted on the training split only.
# ngram_range starts at 1: a lower bound of 0 requests zero-length n-grams,
# which are not meaningful tokens.
x_train, x_test, vect, sel = extractFeature(
    TfidfVectorizer(sublinear_tf=True, max_df=0.7,
                    stop_words='english', ngram_range=(1, 2)),
    SelectKBest(chi2, k=50000),
    df_train.body,
    df_train.controversiality,
    df_test.body)

In [46]:
# Sanity check: the selected training matrix must have as many rows as the
# training frame, and train/test must share the same number of columns.
print(x_train.shape, df_train.shape, x_test.shape, sep='\n')

(158508, 50000)
(158508, 2)
(1000000, 50000)


In [47]:
# Cross-validated logistic regression on the selected TF-IDF features:
# 2 folds, at most 1000 solver iterations, all available cores.
model = LogisticRegressionCV(
    cv=2,
    max_iter=1000,
    n_jobs=-1,
)
model.fit(x_train, y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=2, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=1000,
           multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [48]:
# Report the model score next to the share of positive labels in each split,
# so the score can be read against the class balance.
train_score = model.score(x_train, y_train)
print("model fitting score: ", train_score)

train_positive_share = sum(y_train) / len(y_train)
print("proportion of controversial posts in training set: ", train_positive_share)

test_score = model.score(x_test, y_test)
print("validation score: ", test_score)

test_positive_share = sum(y_test) / len(y_test)
print("proportion of controversial posts in testing set: ", test_positive_share)

model fitting score:  0.829106417342
proportion of controversial posts in training set:  0.445573724985
validation score:  0.775838
proportion of controversial posts in testing set:  0.022174


In [49]:
# Prefer the standalone joblib package: the copy vendored under
# sklearn.externals was deprecated and later removed from scikit-learn.
# Fall back to the vendored copy for old environments that lack the package.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted classifier (maximum compression level).
joblib.dump(model, 'tfidftop50K.pkl', compress=9)

['tfidftop50K.pkl']

In [64]:
# Persist the fitted vectorizer so new text can be transformed the same way.
joblib.dump(value=vect, filename='tfidfmixbigram.pkl', compress=9)

['tfidfmixbigram.pkl']

In [63]:
# Persist the fitted feature selector alongside the vectorizer and model.
joblib.dump(value=sel, filename='tfidfselector.pkl', compress=9)

['tfidfselector.pkl']

In [118]:
# Column indices of the non-zero entries in the first row of the test matrix.
first_test_row = x_test.getrow(0)
row = first_test_row.nonzero()[1]