### Create features 

In [1]:
import pandas as pd
import nltk
import pickle
import operator
import feature

import warnings; warnings.simplefilter('ignore')
from IPython.display import display
# Notebook-wide pandas display settings: no cap on the number of columns or
# rows shown, and floats rendered with zero decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda value: '%.0f' % value)



In [2]:
# Load the processed data (see the data_preprocessing folder for details).
# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so this file should only ever be one produced by this project.
with open('../data_preprocessing/data/svo_df.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)

In [3]:
# Import sentiment lexicon
# From: https://github.com/zeeeyang/lexicon_rnn/blob/master/lexicons/sspe.lex2
# Each non-blank line is split on single spaces; the first field becomes the
# key and the second field is parsed as a float value.
ts_lex = {}
with open('./data/ts_lex.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. an empty last line) instead of failing with
        # IndexError on entry[1].
        if not line.strip():
            continue
        entry = line.split(' ')
        ts_lex[entry[0]] = float(entry[1])

In [4]:
# Work on a subset of at most 200 rows. The fixed random_state makes the
# subset (and every result derived from it) identical across kernel restarts,
# and capping n avoids the ValueError raised when df has fewer than 200 rows.
d = df.sample(n=min(200, len(df)), random_state=0)

In [None]:
# Parameters for feature generation. Each dict below is passed to
# feature.featurize through the keyword argument of the same name.

proto_word_args = dict(
    text_col='full_text_agg',
    user_id='user_id',
    tok_type='clean',
    isalpha=True,
    top_k=100,
    word_count_thresh=5,
)

hashtag_args = dict(
    text_col='hashtags_agg',
    user_id='user_id',
    top_k=50,
    ht_count_thresh=3,
)

topic_model_args = dict(
    text_col='clean_text_agg',
    user_id='user_id',
    # NLTK English stop words plus the token 'rt'
    stops=nltk.corpus.stopwords.words('english') + ['rt'],
    stemmer=nltk.stem.snowball.SnowballStemmer('english'),
    lemmer=None,
)

topic_model_params = dict(
    num_topic=20,
    max_df=0.5,
    min_df=1,
    max_feature=1000,
    alpha=0.1,
    eta=0.1,
    serialized=None,
)

sent_args = dict(
    lexicon=ts_lex,  # the lexicon dict loaded from ./data/ts_lex.txt
    window=4,
    count_thresh=6,
    top_k=20,
    tok_text_col='tokenized_text_agg',
)


In [None]:
# Build features from the sampled frame `d`; 'label' and 0.3 are passed
# positionally (presumably the target column and the test fraction -- confirm
# against feature.featurize).
X_train_ft, X_test_ft, y_train, y_test, obs = feature.featurize(
    d,
    'label',
    0.3,
    proto_word_args=proto_word_args,
    hashtag_args=hashtag_args,
    topic_model_args=topic_model_args,
    topic_model_params=topic_model_params,
    sent_args=sent_args,
)

Took 4 seconds to featurize proto words and hashtags
Took 2 seconds to featurize topics


In [None]:
# Show the shape of X_test_ft (second value returned by feature.featurize).
X_test_ft.shape

In [None]:
# Show the shape of y_train (third value returned by feature.featurize).
y_train.shape

In [None]:
# Show the shape of y_test (fourth value returned by feature.featurize).
y_test.shape

In [None]:
import time

### Classifiers

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Classifiers to evaluate, keyed by a display name.
# Only the random forest is active; the other candidates are kept commented
# out. Every entry ends with a comma so any of them can be uncommented without
# producing a syntax error.
clfs = {
    # random_state is fixed so repeated runs build the same forest and report
    # the same score.
    'Random Forest': RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0),
#         'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, subsample=0.5, max_depth=5),
#         'AdaBoost': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=100),
#         'Bagging, DT': BaggingClassifier(DecisionTreeClassifier(max_depth=1), max_samples=0.5, max_features=0.5),
#         'Naive Bayes': GaussianNB(),        
#         'Logistic Reg': LogisticRegression(penalty='l1', C=1e5),
#         'SVM': SVC(kernel='rbf', probability=True, random_state=0),
#         'Decision Tree': DecisionTreeClassifier()
}

In [None]:
def basic_loop(clfs, X_train, y_train, X_test, y_test):
    """Fit every classifier in ``clfs`` and print its test score and run time.

    Args:
        clfs: mapping of display name -> estimator. Each estimator must offer
            ``fit(X, y)`` returning an object that has ``score(X, y)``.
        X_train, y_train: data handed to ``fit``.
        X_test, y_test: data handed to ``score``.

    Returns:
        None. For each entry the function prints the name, the score, the
        elapsed time in minutes (fit plus scoring, rounded to two decimals)
        and a blank line.
    """
    for name, model in clfs.items():
        print(name)
        began = time.time()
        fitted = model.fit(X_train, y_train)
        test_score = fitted.score(X_test, y_test)
        print('Score: ', test_score)
        # The clock is read after scoring, so the reported time covers both
        # fitting and scoring.
        elapsed_minutes = (time.time() - began) / 60
        print("--- %s minutes ---" % round(elapsed_minutes, 2))
        print()

In [None]:
# Evaluate every configured classifier on the featurized train/test split.
basic_loop(
    clfs,
    X_train=X_train_ft,
    y_train=y_train,
    X_test=X_test_ft,
    y_test=y_test,
)