In [26]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import ElasticNet
from sklearn.cross_validation import cross_val_score 
from sklearn.linear_model import LogisticRegression 
from sklearn.linear_model import LogisticRegressionCV          
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel
import operator

## Get the data we want to analyze

In [2]:
train = ['/data/reddit1.csv', 
         '/data/reddit2.csv',
         '/data/reddit3.csv',
         '/data/reddit4.csv',
         '/data/reddit5.csv']

test =  ['/data/reddit6.csv',
         '/data/reddit7.csv',
         '/data/reddit8.csv',
         '/data/reddit9.csv',
         '/data/reddit10.csv']

finalDf = pd.DataFrame()
finalDf2 = pd.DataFrame()

for file in train:
    df = pd.read_csv(file)
    df = df[['body', 'created_utc', 'controversiality']]
    #adding the controversial 
    controversial = df[df['controversiality'] == 1].copy().reset_index()
    del controversial['index']
    finalDf = finalDf.append(controversial,ignore_index = True)
    #adding the non-controversial 
    non_controversial = df[df['controversiality'] == 0].sample(frac=0.03, replace=False).copy().reset_index()
    del non_controversial['index']
    finalDf = finalDf.append(non_controversial, ignore_index = True)

df_train = finalDf

for file in test:
    df = pd.read_csv(file)
    df = df[['body', 'created_utc', 'controversiality']]
    #adding the controversial 
    controversial = df[df['controversiality'] == 1].copy().reset_index()
    del controversial['index']
    finalDf2 = finalDf2.append(controversial,ignore_index = True)
    #adding the non-controversial 
    non_controversial = df[df['controversiality'] == 0].sample(frac=0.03, replace=False).copy().reset_index()
    del non_controversial['index']
    finalDf2 = finalDf2.append(non_controversial, ignore_index = True)

df_test = finalDf2

del df

In [3]:
df_train = df_train.drop('created_utc', axis=1)
df_train['body'] = df_train['body'].astype(str)

df_test = df_test.drop('created_utc', axis=1)
df_test['body'] = df_test['body'].astype(str)

In [4]:
Y_train = df_train.controversiality
Y_test = df_test.controversiality

In [17]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english', analyzer="word")
X_train_tfidf = vectorizer.fit_transform(df_train.body)
X_test_tfidf = vectorizer.transform(df_test.body)

In [34]:
print(df_train.shape)
print(X_train_tfidf.shape)
print(df_test.shape)
print(X_test_tfidf.shape)
print(X_train_tfidf.getrow(1))

(269775, 2)
(269775, 169058)
(264719, 2)
(264719, 169058)
  (0, 88767)	0.176662689981
  (0, 82619)	0.106451022602
  (0, 73596)	0.203820642187
  (0, 159749)	0.183968169192
  (0, 26156)	0.285665338763
  (0, 47702)	0.157232238637
  (0, 89541)	0.194282103652
  (0, 71813)	0.203084519354
  (0, 125733)	0.468540397632
  (0, 129310)	0.467578771188
  (0, 66533)	0.225719532569
  (0, 39167)	0.343297947997
  (0, 60269)	0.299654821251


### Attempting to feature select

In [19]:
numFeatures = 60000
feature_names = vectorizer.get_feature_names()
print("Extracting %d best features by a chi-squared test" % numFeatures)
ch2 = SelectKBest(chi2, k=numFeatures)
X_train = ch2.fit_transform(X_train_tfidf, Y_train)
X_test = ch2.transform(X_test_tfidf)
feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]

Extracting 60000 best features by a chi-squared test


In [None]:
# lasso = LassoCV(cv = 10, n_alphas=50, max_iter=100000, normalize=True)
# lasso.fit(X_train_tfidf, Y_train)

In [None]:
# sfm = SelectFromModel(LassoCV(cv = 10, n_alphas=50, max_iter=100000, normalize=True), threshold='median')
# sfm.fit(X_train_tfidf, Y_train)
# n_features = sfm.transform(X_train_tfidf).shape[1]

In [30]:
elastic_fit = ElasticNet(alpha=.05,l1_ratio=.5,max_iter=100000,normalize=True)
elastic_fit.fit(X_train,Y_train)


ElasticNet(alpha=0.05, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=100000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [31]:
sum(elastic_fit.coef_)

0.0

# Logistic Regression, the easy part

In [20]:
model = LogisticRegression()
model.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
model.score(X_train, Y_train)

0.70420906310814568

In [22]:
model.score(X_test, Y_test)

0.6529489760840741

# Ignore for now

# Need to create columns for 
    1. Time of Day
    2. Day of Week
    3. Word Count
    4. top 500 word for controversial (tfidf value)
    5. top 500 word for non-controversial (tfidf value)

In [None]:
df['body'] = df['body'].astype('str')
df['length']=df['body'].str.split(' ').str.len()

### Construct contraversial word list

In [None]:
del df['created_utc']
top_n_words = 300

In [None]:
controversial = df[df['controversiality'] == 1]
comments = controversial['body']
vectorizer = TfidfVectorizer(min_df=1, stop_words='english', analyzer='word')
X = vectorizer.fit_transform(comments)
idf = vectorizer.idf_

tfidf_score = dict(zip(vectorizer.get_feature_names(), idf))
tfidf_score_sorted = sorted(tfidf_score.items(), key=operator.itemgetter(1), reverse=True)

controversial_words = list( tfidf_score_sorted[:top_n_words][x][0] for x in range(top_n_words))

In [None]:
non_controversial = df[df['controversiality'] == 0]
comments = non_controversial['body']
vectorizer = TfidfVectorizer(min_df=1, stop_words='english', analyzer='word')
X = vectorizer.fit_transform(comments)
idf = vectorizer.idf_

tfidf_score = dict(zip(vectorizer.get_feature_names(), idf))
tfidf_score_sorted = sorted(tfidf_score.items(), key=operator.itemgetter(1), reverse=True)

non_controversial_words = list( tfidf_score_sorted[:top_n_words][x][0] for x in range(top_n_words))

In [None]:
words=[]
for word in non_controversial_words: 
    df[word+'count']=0
    words.append(word)
for word in controversial_words:
    df[word+'count']=0
    words.append(word)

In [None]:
# dict for easy lookup
ws = {}
for w in words:
    ws[w] = True

import re
d = {}
for row in df.itertuples():
    for word in re.findall(r"[\w']+", row[1].strip().lower()):
        if word in ws:
            if (word+'count', row[0]) not in d:
                d[(word+'count', row[0])] = 0
            d[(word+'count', row[0])] += 1    
            
# add the dict to df            
for (w, i) in d:
    df.set_value(i, w, d[(w, i)])

del d

In [None]:
# Verify that sum > 0
num = 0
sums = []
cols_to_drop=[]
for col in df.columns:
    if col != 'body':
        su = sum(df[col])
        sums.append(su)
        if su == 0:
            num += 1
            cols_to_drop.append(col)
# TODO: Delete cols that have sum 0            
print("there are %d columns that are empty." % num)
df = df.drop(cols_to_drop, axis = 1)

In [None]:
X = df.drop(['controversiality', 'body'], axis = 1)
y = df['controversiality']
lasso_fit = LassoCV(cv = 10, n_alphas=50, max_iter=100000, normalize=True)
lasso_fit.fit(X, y)

In [None]:
elastic_fit=ElasticNet(alpha=5,l1_ratio=.5,max_iter=100000,normalize=True)
elastic_fit.fit(X,y)
elastic_fit.coef_

In [None]:
col=[]
for i in range(len(lasso_fit.coef_)):
    if lasso_fit.coef_[i]!=0:
        col.append(i + 2)

print(col)

In [None]:
X_logistic = df.ix[:, col]

In [None]:
from sklearn.linear_model import LogisticRegression
model1=LogisticRegression
model1.fit(X_logistic, y)


In [None]:
print(len(df.ix[df['controversiality']==0])/len(df))
print(len(df[df['controversiality'] == 1]) )