### Create features 

In [2]:
import operator
import pickle
import time

import nltk
import pandas as pd
# import feature

import warnings; warnings.simplefilter('ignore')
from IPython.display import display
# Remove pandas' display limits: None means every column and every row of a
# frame is rendered, so avoid displaying large frames in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
# Render floats in displayed pandas objects with zero decimal places.
pd.set_option('display.float_format', lambda x: '%.0f' % x)

In [3]:
# Load the preprocessed data into `df`.
# See the data_preprocessing folder for how this pickle is produced.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('../data_preprocessing/data/svo_df.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)

In [4]:
# Import sentiment lexicon
# From: https://github.com/zeeeyang/lexicon_rnn/blob/master/lexicons/sspe.lex2
# Each line is split on single spaces; field 0 is the token and field 1 is its
# score (converted to float). Any further fields are ignored.
ts_lex = {}
with open('./data/ts_lex.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # A blank line has no score field and would raise IndexError below.
        if not line.strip():
            continue
        entry = line.split(' ')
        ts_lex[entry[0]] = float(entry[1])

In [5]:
# NOTE(review): `sentiments` is not imported in any cell shown in this notebook,
# so this call fails with NameError on a fresh kernel — confirm the module.
# The trailing 4, 6, 20 equal window / count_thresh / top_k in the `sent_args`
# dict defined further down — presumably the same settings; verify.
d, sd = sentiments.featurize_sentiments(df, 'label', 'tokenized_text_agg', ts_lex, 4, 6, 20)

In [12]:
# Draw 10 random rows for inspection. This rebinds `d` (previously assigned by
# the featurize_sentiments cell) and passes no random_state, so the selected
# rows are not fixed by this call.
d = df.sample(10)

In [15]:
# Row labels of the sampled rows.
d.index

Int64Index([13596, 9190, 12555, 23243, 24709, 6637, 18983, 10220, 21943, 2185], dtype='int64')

In [17]:
# Empty placeholder frame.
# NOTE(review): no later cell shown here reads `a` before it is reassigned.
a = pd.DataFrame()

In [19]:
# (rows, columns) of the sample.
d.shape

(10, 11)

In [16]:
# Inspect one sampled row. Positional access is used because `d` is a random
# sample: a hard-coded label such as 13596 is only present by chance and would
# raise KeyError on a re-run.
d.iloc[0]

label                                                                 r
html                  RT <a href="https://twitter.com/F6x">@F6x</a>:...
user_id                                                        32585306
full_text_agg         RT @F6x: Hey Minnesota, you do not get to invo...
clean_text_agg         Hey Minnesota, you do not get to invoke His P...
tokenized_text_agg    [[f6x, :, hey, minnesota, ,, you, do, not, get...
to_users_agg                                                      [F6x]
hashtags_agg                                                 [RNCinCLE]
urls_agg                                                             []
parse_sents           [[[(S\n  (NP\n    (NP (NNS f6x))\n    (UCP\n  ...
svos                  [[(f6x, rncincle , ), (you, do get, ), (, hey ...
Name: 13596, dtype: object

In [66]:
# Parameter sets for feature generation, one dict per feature family.

# Word-based features built from the full aggregated text.
proto_word_args = dict(
    text_col='full_text_agg',
    user_id='user_id',
    tok_type='clean',
    isalpha=True,
    top_k=100,
    word_count_thresh=5,
)

# Hashtag-based features.
hashtag_args = dict(
    text_col='hashtags_agg',
    user_id='user_id',
    top_k=50,
    ht_count_thresh=3,
)

# Text preparation for the topic model: English stop words plus 'rt',
# Snowball stemming, no lemmatizer.
topic_model_args = dict(
    text_col='clean_text_agg',
    user_id='user_id',
    stops=nltk.corpus.stopwords.words('english') + ['rt'],
    stemmer=nltk.stem.snowball.SnowballStemmer('english'),
    lemmer=None,
)

# Topic model hyper-parameters.
topic_model_params = dict(
    num_topic=20,
    max_df=0.5,
    min_df=1,
    max_feature=1000,
    alpha=0.1,
    eta=0.1,
    serialized=None,
)

# Sentiment feature settings, using the lexicon loaded into `ts_lex`.
# NOTE(review): this dict is not passed to the feature.featurize call in the
# next cell.
sent_args = dict(
    lexicon=ts_lex,
    window=4,
    count_thresh=6,
    top_k=20,
    tok_text_col='tokenized_text_agg',
)


In [None]:
# Build the train/test feature matrices and labels from `df`.
# 0.2 is presumably the test fraction — confirm against feature.featurize.
# NOTE(review): `import feature` is commented out in the import cell, and
# random_state=None passes no seed, so the split is likely not reproducible.
X_train_ft, X_test_ft, y_train, y_test = feature.featurize(
    df,
    'label',
    proto_word_args,
    hashtag_args,
    topic_model_args,
    topic_model_params,
    0.2,
    random_state=None,
    topic_words=False,
)

### Classifiers

In [31]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [35]:
# Candidate classifiers, keyed by the display name that basic_loop prints.
# NOTE(review): only the SVM fixes random_state; the other stochastic models
# are unseeded, so their scores can vary between runs.
clfs = {
    'Random Forest': RandomForestClassifier(n_estimators=100, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, subsample=0.5, max_depth=5),
    'AdaBoost': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=100),
    'Bagging, DT': BaggingClassifier(DecisionTreeClassifier(max_depth=1), max_samples=0.5, max_features=0.5),
    'Naive Bayes': GaussianNB(),
    # The L1 penalty needs a solver that supports it. liblinear was the default
    # before scikit-learn 0.22; the newer default (lbfgs) rejects penalty='l1',
    # so the solver is pinned explicitly to behave the same on both.
    'Logistic Reg': LogisticRegression(penalty='l1', C=1e5, solver='liblinear'),
    # probability=True enables predict_proba but makes fitting slower.
    'SVM': SVC(kernel='rbf', probability=True, random_state=0),
    'Decision Tree': DecisionTreeClassifier(),
}

In [41]:
def basic_loop(clfs, X_train, y_train, X_test, y_test):
    """Fit each classifier on the training data and print its test score.

    Args:
        clfs: Mapping of display name -> estimator exposing ``fit(X, y)``;
            the object returned by ``fit`` must expose ``score(X, y)``.
        X_train: Training features passed to ``fit``.
        y_train: Training labels passed to ``fit``.
        X_test: Held-out features passed to ``score``.
        y_test: Held-out labels passed to ``score``.

    Returns:
        None. For every classifier, in mapping order, prints the name, the
        score, and the elapsed time in minutes (fit plus score), followed by
        a blank line.
    """
    for key, clf in clfs.items():
        print(key)
        # perf_counter is monotonic, so long fits are not mis-timed if the
        # system clock is adjusted while they run.
        start_time = time.perf_counter()
        clf_fit = clf.fit(X_train, y_train)
        print('Score: ', clf_fit.score(X_test, y_test))
        print("--- %s minutes ---" % round((time.perf_counter() - start_time)/60, 2))
        print()

In [42]:
# Fit and score every candidate on the featurized split. In the recorded run
# below the SVM alone took about 93 minutes, so expect a long wait.
basic_loop(clfs, X_train_ft, y_train, X_test_ft, y_test)

Random Forest
Score:  0.8782346685572492
--- 0.08 minutes ---

Gradient Boosting
Score:  0.8566111308046792
--- 0.69 minutes ---

AdaBoost
Score:  0.84774902516838
--- 0.27 minutes ---

Bagging, DT
Score:  0.848103509393832
--- 0.02 minutes ---

Naive Bayes
Score:  0.5233959588798298
--- 0.01 minutes ---

Logistic Reg
Score:  0.8459766040411202
--- 2.78 minutes ---

SVM
Score:  0.5235732009925558
--- 92.96 minutes ---

Decision Tree
Score:  0.8186813186813187
--- 0.02 minutes ---



In [23]:
# Perceptron baseline on the same split. The import lives in this cell because
# Perceptron is not imported by any other cell shown in the notebook; without
# it this cell fails with NameError.
from sklearn.linear_model import Perceptron

clf = Perceptron().fit(X_train_ft, y_train)
clf.score(X_test_ft, y_test)

0.47660404112017013

In [26]:
# Default-parameter SVC fitted on the training split and scored on the
# held-out split; rebinds `clf`.
clf = SVC().fit(X_train_ft, y_train)
clf.score(X_test_ft, y_test)

0.5235732009925558

In [2]:
# Read "X_train_ft.csv" into `a` (rebinding it) — presumably a saved copy of
# the training feature matrix.
# NOTE(review): no cell shown here writes this file; confirm where it comes from.
a = pd.read_csv("X_train_ft.csv")

In [8]:
# Number of columns in `b`.
# NOTE(review): `b` is not defined in any cell shown in this notebook, so this
# fails with NameError on a fresh kernel — possibly `a` from the previous cell
# was intended; confirm before re-running.
len(b.columns)

323