In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import ElasticNet
from sklearn.cross_validation import cross_val_score 
from sklearn.linear_model import LogisticRegression 
from sklearn.linear_model import LogisticRegressionCV          
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel
import operator



## Get the data we want to analyze

In [2]:
# Monthly comment dumps: the first eight files form the training split,
# the last two the held-out test split.
train = ['/data/reddit1.csv',
         '/data/reddit2.csv',
         '/data/reddit3.csv',
         '/data/reddit4.csv',
         '/data/reddit5.csv',
         '/data/reddit6.csv',
         '/data/reddit7.csv',
         '/data/reddit8.csv',]

test =  ['/data/reddit9.csv',
         '/data/reddit10.csv']

# Seed for the down-sampling below so that Restart & Run All reproduces the
# same train/test frames.
SEED = 42
_sampler = np.random.RandomState(SEED)


def _load_comments(paths, frac=0.03, random_state=None):
    """Read every CSV in `paths` and stack the rows into one frame.

    For each file, in order, all rows with ``controversiality == 1`` are kept,
    followed by a random `frac` sample (without replacement) of the rows with
    ``controversiality == 0``.

    Parameters
    ----------
    paths : list of str
        CSV files containing at least the columns ``body``, ``created_utc``
        and ``controversiality``.
    frac : float
        Fraction of the non-controversial rows of each file to keep.
    random_state : int or numpy.random.RandomState, optional
        Passed to ``DataFrame.sample`` to make the sampling repeatable.

    Returns
    -------
    pandas.DataFrame
        The selected rows with a fresh 0..n-1 index; an empty frame with the
        three expected columns when `paths` is empty.
    """
    columns = ['body', 'created_utc', 'controversiality']
    pieces = []
    for path in paths:
        df = pd.read_csv(path)[columns]
        # adding the controversial
        pieces.append(df[df['controversiality'] == 1])
        # adding the non-controversial
        pieces.append(
            df[df['controversiality'] == 0].sample(frac=frac, replace=False,
                                                   random_state=random_state)
        )
    if not pieces:
        return pd.DataFrame(columns=columns)
    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(pieces, ignore_index=True)


finalDf = _load_comments(train, random_state=_sampler)
finalDf2 = _load_comments(test, random_state=_sampler)

# The timestamp is not used as a feature; bodies are coerced to str so that
# missing comments (NaN) do not break the text vectorizers.
df_train = finalDf.drop('created_utc', axis=1)
df_train['body'] = df_train['body'].astype(str)

df_test = finalDf2.drop('created_utc', axis=1)
df_test['body'] = df_test['body'].astype(str)

In [3]:
# Target labels (the 0/1 `controversiality` column) for the two splits
y_train, y_test = df_train['controversiality'], df_test['controversiality']

In [4]:
def extractFeature(vectorizer, selector, x_train, y_train, test):
    """Turn raw documents into selected feature matrices.

    The vectorizer is fitted on the training documents only and then applied
    to the test documents; the selector is fitted on the vectorized training
    data together with `y_train` and then applied to the vectorized test data.

    Parameters
    ----------
    vectorizer : object with ``fit_transform(docs)`` and ``transform(docs)``
    selector : object with ``fit_transform(X, y)`` and ``transform(X)``
    x_train : training documents
    y_train : training labels handed to the selector
    test : test documents

    Returns
    -------
    tuple
        ``(selected_train_features, selected_test_features)``.
    """
    train_matrix = vectorizer.fit_transform(x_train)
    test_matrix = vectorizer.transform(test)
    selected_train = selector.fit_transform(train_matrix, y_train)
    selected_test = selector.transform(test_matrix)
    return selected_train, selected_test

In [5]:
def logisticModel(x_train, y_train, x_test, y_test):
    """Fit a cross-validated logistic regression and score it on both splits.

    The model is a ``LogisticRegressionCV`` with 5 folds, at most 2000
    iterations, and all available cores.

    Returns
    -------
    tuple
        ``(score on the training data, score on the test data, fitted model)``.
    """
    classifier = LogisticRegressionCV(cv = 5, max_iter=2000, n_jobs=-1)
    classifier.fit(x_train, y_train)
    return (
        classifier.score(x_train, y_train),
        classifier.score(x_test, y_test),
        classifier,
    )

In [6]:
# Unigram TF-IDF features; keep the top 50% of terms by chi-squared score
unigram_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.7, stop_words='english')
unigram_selector = SelectPercentile(score_func=chi2, percentile=50)
x_train, x_test = extractFeature(
    unigram_vectorizer, unigram_selector, df_train.body, y_train, df_test.body
)
print(logisticModel(x_train, y_train, x_test, y_test))

(0.72189361518731376, 0.65195123573512226)


In [7]:
# Bigram-only TF-IDF features; keep the top 50% of terms by chi-squared score
bigram_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.7,
                                    stop_words='english', ngram_range=(2,2))
bigram_selector = SelectPercentile(score_func=chi2, percentile=50)
x_train, x_test = extractFeature(
    bigram_vectorizer, bigram_selector, df_train.body, y_train, df_test.body
)
print(logisticModel(x_train, y_train, x_test, y_test))

(0.93535048095179296, 0.60008931880160776)


In [8]:
# Raw word counts (no TF-IDF weighting); keep the top 90% of terms by chi-squared score
count_vectorizer = CountVectorizer(analyzer='word', stop_words='english')
count_selector = SelectPercentile(score_func=chi2, percentile=90)
x_train, x_test = extractFeature(
    count_vectorizer, count_selector, df_train.body, y_train, df_test.body
)
print(logisticModel(x_train, y_train, x_test, y_test))

(0.70784129639163851, 0.64179359755228471)


In [None]:
# Unigrams and bigrams together; keep the top 90% of terms by chi-squared score
mixed_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.7,
                                   stop_words='english', ngram_range=(1,2))
mixed_selector = SelectPercentile(score_func=chi2, percentile=90)
x_train, x_test = extractFeature(
    mixed_vectorizer, mixed_selector, df_train.body, y_train, df_test.body
)
# Keep the scores and the fitted model for later inspection instead of printing them
accu, accu2, model = logisticModel(x_train, y_train, x_test, y_test)

# Ignore

# Need to create columns for 
    1. Time of Day
    2. Day of Week
    3. Word Count
    4. Top 500 words for controversial comments (TF-IDF value)
    5. Top 500 words for non-controversial comments (TF-IDF value)

In [None]:
# NOTE(review): `df` is not available from the earlier cells on a fresh
# Restart & Run All; this section presumably relied on a frame left over in
# the kernel -- confirm which frame is meant before running.
body_text = df['body'].astype('str')
df['body'] = body_text
# "length" = number of pieces obtained by splitting the body on single spaces
df['length'] = body_text.str.split(' ').str.len()

### Construct controversial word list

In [None]:
# Number of highest-ranked terms to keep from each class's vocabulary
top_n_words = 300
# The timestamp column is not used by the word-count features built below
df = df.drop('created_utc', axis=1)

In [None]:
# Build the vocabulary of the controversial comments and keep its
# `top_n_words` highest-ranked terms.
controversial = df[df['controversiality'] == 1]
comments = controversial['body']
vectorizer = TfidfVectorizer(min_df=1, stop_words='english', analyzer='word')
X = vectorizer.fit_transform(comments)
idf = vectorizer.idf_

# NOTE(review): the ranking key is the vectorizer's IDF weight alone, so the
# "top" terms are the ones that occur in the fewest comments, not the ones
# with the largest TF-IDF values -- confirm this is the intended ranking.
tfidf_score = dict(zip(vectorizer.get_feature_names(), idf))
tfidf_score_sorted = sorted(tfidf_score.items(), key=operator.itemgetter(1), reverse=True)

# Slicing tolerates a vocabulary smaller than `top_n_words`; indexing every
# position in range(top_n_words) raised IndexError in that case.
controversial_words = [term for term, _ in tfidf_score_sorted[:top_n_words]]

In [None]:
# Build the vocabulary of the non-controversial comments and keep its
# `top_n_words` highest-ranked terms.
non_controversial = df[df['controversiality'] == 0]
comments = non_controversial['body']
vectorizer = TfidfVectorizer(min_df=1, stop_words='english', analyzer='word')
X = vectorizer.fit_transform(comments)
idf = vectorizer.idf_

# NOTE(review): the ranking key is the vectorizer's IDF weight alone, so the
# "top" terms are the ones that occur in the fewest comments, not the ones
# with the largest TF-IDF values -- confirm this is the intended ranking.
tfidf_score = dict(zip(vectorizer.get_feature_names(), idf))
tfidf_score_sorted = sorted(tfidf_score.items(), key=operator.itemgetter(1), reverse=True)

# Slicing tolerates a vocabulary smaller than `top_n_words`; indexing every
# position in range(top_n_words) raised IndexError in that case.
non_controversial_words = [term for term, _ in tfidf_score_sorted[:top_n_words]]

In [None]:
# Candidate vocabulary: non-controversial terms first, then controversial ones
# (a term present in both lists appears twice)
words = list(non_controversial_words) + list(controversial_words)

# One zero-initialised "<term>count" column per candidate term
for word in words:
    df[word+'count']=0

In [None]:
import re

# Set of tracked terms for constant-time membership tests
ws = set(words)

# (column name, row label) -> number of times the term occurs in that row's body
d = {}
# Iterate over the `body` column by name so the result does not depend on
# where that column sits in the frame.
for i, text in zip(df.index, df['body']):
    for word in re.findall(r"[\w']+", text.strip().lower()):
        if word in ws:
            key = (word+'count', i)
            d[key] = d.get(key, 0) + 1

# Write the non-zero counts back into the frame. `.at` is the scalar setter
# that replaces the deprecated `DataFrame.set_value`.
for (w, i), count in d.items():
    df.at[i, w] = count

del d

In [None]:
# Find every column other than 'body' whose values add up to zero and drop it
checked_cols = [name for name in df.columns if name != 'body']
sums = [sum(df[name]) for name in checked_cols]
cols_to_drop = [name for name, total in zip(checked_cols, sums) if total == 0]
num = len(cols_to_drop)

print("there are %d columns that are empty." % num)
df = df.drop(cols_to_drop, axis = 1)

In [None]:
# Response is the controversiality flag; predictors are every remaining
# column except the raw text
y = df.controversiality
X = df.drop(['controversiality', 'body'], axis=1)

# 10-fold cross-validated lasso over 50 candidate alphas; `fit` returns the
# estimator, which is then displayed as the cell output
lasso_fit = LassoCV(cv=10, n_alphas=50, max_iter=100000, normalize=True).fit(X, y)
lasso_fit

In [None]:
# Elastic net with a fixed penalty (alpha=5, equal L1/L2 mix); the cell output
# is the fitted coefficient vector
elastic_fit = ElasticNet(alpha=5, l1_ratio=.5, max_iter=100000, normalize=True).fit(X, y)
elastic_fit.coef_

In [None]:
# Positions of the features the lasso kept (non-zero coefficient), shifted by
# 2 -- presumably to skip the two columns of `df` that are not part of `X`
# ('body' and 'controversiality'); verify they are the first two columns.
col = [position + 2
       for position, weight in enumerate(lasso_fit.coef_)
       if weight != 0]

print(col)

In [None]:
# `col` holds integer column positions, so select them positionally with
# `.iloc` (the `.ix` indexer is deprecated and removed in pandas 1.0).
X_logistic = df.iloc[:, col]

In [None]:
# Plain logistic regression on the lasso-selected columns.
# The estimator must be instantiated: binding the class itself and calling
# `.fit` on it fails because there is no instance to fit.
# (LogisticRegression is already imported in the first cell.)
model1 = LogisticRegression()
model1.fit(X_logistic, y)


In [None]:
# Class balance: share of non-controversial rows, then the number of
# controversial rows. Boolean masks replace the removed `.ix` indexer, and
# float() keeps the ratio a true division.
non_controversial_mask = df['controversiality'] == 0
controversial_mask = df['controversiality'] == 1
print(float(non_controversial_mask.sum()) / len(df))
print(int(controversial_mask.sum()))