### Create features 

In [1]:
import pandas as pd
import pickle
import operator

In [2]:
from IPython.display import display
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.set_option('display.float_format', lambda x: '%.0f' % x)

In [3]:
with open('../data_preprocessing/data/svo_df.pkl', 'rb') as f:
    df = pickle.load(f)

In [4]:
ts_lex = {}
with open('./data/ts_lex.txt','r',encoding='utf-8') as f:
    for line in f:
        entry = line.split(' ')
        ts_lex[entry[0]] = float(entry[1])

In [5]:
def svo_sentiments(df, lex, label_col, window, count_thresh):
    """
    Given a dataframe, returns sentiments dictionary of words and 
    their associated sentiment score (ranging -1 ~ 1)
    Return dictionary is disaggregated by subject/object and label: 
        sentiments = {'object': {'d': {'word': float ...}}}
    
    Requires `lex`, a sentiment lexicon (dictionary of word key, sentiment val)
    `window` refers to number of words to check above and below target word
    Return dictionary discludes words appearing less than `count_thresh` times
    """
    # get sentiment around svo (aggregate and count of tweets)
    svo_sentiments = {
        'subject': {},
        'object': {}
    }
    
    labels = df[label_col].unique()
    for label in labels:
        for key in svo_sentiments.keys():
            svo_sentiments[key][label] = {}
       
        # all svos for one label
        label_svos = df[df[label_col] == label].reset_index()
        for i in range(label_svos.shape[0]):
            user_svo_list = label_svos.loc[i,'svos']
            for tweet_svo_list in user_svo_list:
                for svo in tweet_svo_list:
                    s = svo[0]
                    o = svo[2]
                    # checking s and o in context
                    tokenized_text_lists = label_svos.loc[i,'tokenized_text_agg']
                    for tokenized_text in tokenized_text_lists:
                        
                        # for 's'ubjects
                        if s != '' and '/' not in s:
                            if s in tokenized_text:
                                s_index = tokenized_text.index(s)
                               
                                # get words in window
                                lower_bound = max(0, s_index-window)
                                upper_bound = min(len(tokenized_text), s_index+1+window)
                                pre_words = tokenized_text[lower_bound : s_index]
                                post_words = tokenized_text[s_index+1: upper_bound]
                                window_words = pre_words + post_words
                                
                                # tally up sentiment score for s 
                                sentiment_total = 0
                                sentiment_count = 0
                                for word in window_words:
                                    if word in lex:
                                        sentiment_total += lex[word]
                                        sentiment_count += 1
                                
                                # build dictionary
                                if sentiment_count != 0:
                                    avg_sentiment = sentiment_total/sentiment_count
                                    if s in svo_sentiments['subject'][label]:
                                        svo_sentiments['subject'][label][s]['total_sentim'] += avg_sentiment
                                        svo_sentiments['subject'][label][s]['count'] += 1    
                                    else:
                                        svo_sentiments['subject'][label][s] = {'total_sentim': avg_sentiment, 'count': 1}
                                else:
                                    continue
                        
                        # for 'o'bjects
                        if o != '' and '/' not in o:
                            if o in tokenized_text:
                                o_index = tokenized_text.index(o)
                               
                                # get words in window
                                lower_bound = max(0, o_index-window)
                                upper_bound = min(len(tokenized_text), o_index+1+window)
                                pre_words = tokenized_text[lower_bound : o_index]
                                post_words = tokenized_text[o_index+1: upper_bound]
                                window_words = pre_words + post_words
                                
                                # tally up score for s 
                                sentiment_total = 0
                                sentiment_count = 0
                                for word in window_words:
                                    if word in lex:
                                        sentiment_total += lex[word]
                                        sentiment_count += 1
                                
                                if sentiment_count != 0:
                                    avg_sentiment = sentiment_total/sentiment_count
                                    if o in svo_sentiments['object'][label]:
                                        svo_sentiments['object'][label][o]['total_sentim'] += avg_sentiment
                                        svo_sentiments['object'][label][o]['count'] += 1    
                                    else:
                                        svo_sentiments['object'][label][o] = {'total_sentim': avg_sentiment, 'count': 1}
                                else:
                                    continue
    
    # get average sentiment (if count above threshold) per label, per word 
    sentiments = {
        'subject': {},
        'object': {}
    }
    for label in labels:
        for key in sentiments.keys():
            sentiments[key][label] = {}
            
    for sv in svo_sentiments.keys():
        for label_key in svo_sentiments[sv]:
            for word_key in svo_sentiments[sv][label_key]:
                if svo_sentiments[sv][label_key][word_key]['count'] >= count_thresh:
                    sentiments[sv][label_key][word_key] = svo_sentiments[sv][label_key][word_key]['total_sentim'] / svo_sentiments[sv][label_key][word_key]['count']

    # return sentiments, svo_sentiments
    return sentiments

In [6]:
sentiments = svo_sentiments(df, ts_lex, 'label', 4, 6)

In [37]:
ss

{'object': {'d': {'narrative': {'count': 6,
    'total_sentim': -1.6705033333333335},
   'man': {'count': 59, 'total_sentim': 9.892546249999999},
   'woman': {'count': 52, 'total_sentim': 2.004590833333332},
   'votes': {'count': 26, 'total_sentim': 6.34646975},
   'nomination': {'count': 28, 'total_sentim': 8.146099700000002},
   'part': {'count': 17, 'total_sentim': 1.8339218333333331},
   'yourself': {'count': 3, 'total_sentim': 0.6315715000000001},
   'platform': {'count': 13, 'total_sentim': 3.3405655},
   'you': {'count': 199, 'total_sentim': 71.86070304999994},
   'dnc': {'count': 112, 'total_sentim': -9.448726683333328},
   'voters': {'count': 9, 'total_sentim': 0.8275083333333332},
   'dogs': {'count': 1, 'total_sentim': 0.132008},
   'demsinphilly': {'count': 249, 'total_sentim': 72.7541872},
   'amp': {'count': 244, 'total_sentim': 50.91533435000004},
   'things': {'count': 25, 'total_sentim': 4.963736450000001},
   'campaign': {'count': 10, 'total_sentim': 3.983261999999999

In [7]:
def most_pos_neg_sents(sent_dict, k):
    """
    """
    k_sentiment_dict = {}
    for sv in sent_dict.keys():
        for label in sent_dict[sv].keys():
            if label not in k_sentiment_dict:
                k_sentiment_dict[label] = {}
            
            # sort words by sentiment value
            sorted_list = sorted(sent_dict[sv][label].items(), key=operator.itemgetter(1))
            sorted_list.reverse()
            
            # get k most positive
            kpos_list = sorted_list[:k] 
            kpos_list = [pair[0] for pair in kpos_list]
            if 'POSITIVE' in k_sentiment_dict[label].keys():
                k_sentiment_dict[label]['POSITIVE'] += kpos_list
            else:
                k_sentiment_dict[label]['POSITIVE'] = kpos_list
            
            # get k most negative words
            kneg_list = sorted_list[len(sorted_list)-k :]
            kneg_list = [pair[0] for pair in kneg_list]
            if 'NEGATIVE' in k_sentiment_dict[label]:
                k_sentiment_dict[label]['NEGATIVE'] += kneg_list
            else:
                k_sentiment_dict[label]['NEGATIVE'] = kneg_list
    
    # find unique and common words between labels (only works for 2 labels!)
    labels = list(k_sentiment_dict.keys())
    sentiments = ['POSITIVE', 'NEGATIVE']
    
    separate_sentiment = {}
    for sentiment in sentiments: 
        label1_pos_words = k_sentiment_dict[labels[0]][sentiment]
        label2_pos_words = k_sentiment_dict[labels[1]][sentiment]
        label1_pos_unique = [word for word in label1_pos_words if word not in label2_pos_words]
        label2_pos_unique = [word for word in label2_pos_words if word not in label1_pos_words]
        common_pos = [word for word in label1_pos_words if word in label2_pos_words]

        if labels[0] not in separate_sentiment:
            separate_sentiment[labels[0]] = {}
        separate_sentiment[labels[0]][sentiment] = label1_pos_unique
        if labels[1] not in separate_sentiment:
            separate_sentiment[labels[1]] = {}
        separate_sentiment[labels[1]][sentiment] = label2_pos_unique
        separate_sentiment[sentiment + '_COMMON'] = common_pos       
    return separate_sentiment#k_sentiment_dict

In [8]:
p=most_pos_neg_sents(s, 20)

In [9]:
p

{'NEGATIVE_COMMON': ['protesters', 'group', 'control'],
 'POSITIVE_COMMON': ['thanks', ']', 'movement', 'conscience', 'lady'],
 'd': {'NEGATIVE': ['democracy',
   'signs',
   'followers',
   'house',
   'speaker',
   'source',
   'seats',
   'rights',
   'leaks',
   '.facebook',
   'heads',
   'bowl',
   'barack',
   'pa',
   'criminals',
   'warming',
   '.realdonaldtrump',
   'davidwohl',
   'fact',
   'philosophy',
   'senator',
   'economy',
   'center',
   'themselves',
   'violence',
   'workers',
   'law',
   'press',
   'professionals',
   'row',
   'email',
   'role',
   'crisis',
   'opposition',
   'morning',
   'the…',
   'teenager'],
  'POSITIVE': ['ddlovato',
   'poll',
   '.flotus',
   'reform',
   'train',
   'fact',
   'leak',
   'silverman',
   'kaine',
   'call',
   'week',
   'chelsea',
   'problem',
   'watch',
   'granholm',
   'tonight',
   'moment',
   'her',
   'you…',
   'bloomberg',
   'topics',
   'tonight.',
   'fun',
   'us—it',
   'kick-ass',
   'yorker',

In [33]:
def featurize_sentiments(df, sent_dict, label_col, tok_text_col, lex, window):
    labels = df[label_col].unique()
    
    sent_features = []
    for label in labels:
        sent_features += sent_dict[label]['NEGATIVE']
        sent_features += sent_dict[label]['POSITIVE']
    for i in range(df.shape[0]):
        tokenized_text_lists = df.loc[i,'tokenized_text_agg']
        i_sent_dict = {}
        for tokenized_text in tokenized_text_lists:
            for sent in sent_features:
                if sent in tokenized_text:
                    sent_index = tokenized_text.index(sent)
                    # get words in window
                    lower_bound = max(0, sent_index-window)
                    upper_bound = min(len(tokenized_text), sent_index+1+window)
                    pre_words = tokenized_text[lower_bound : sent_index]
                    post_words = tokenized_text[sent_index+1 : upper_bound]
                    window_words = pre_words + post_words
                    # tally up score for s 
                    sentiment_total = 0
                    sentiment_count = 0
                    for word in window_words:
                        if word in lex:
                            sentiment_total += lex[word]
                            sentiment_count += 1
                    if sentiment_count != 0:
                        avg_sentiment = sentiment_total/sentiment_count
                        if 'sent_' + sent in i_sent_dict:
                            i_sent_dict['sent_' + sent]['avg'] += avg_sentiment
                            i_sent_dict['sent_' + sent]['count'] +=  1
                        else:
                            i_sent_dict['sent_' + sent] = {'avg': avg_sentiment, 'count': 1}                   
        for sent_key in i_sent_dict.keys():
            df.loc[i, sent_key] = i_sent_dict[sent_key]['avg'] / i_sent_dict[sent_key]['count']
        df.fillna(0, inplace=True)
    return df

In [34]:
featurize_sentiments(df, p, 'label', 'tokenized_text_agg', ts_lex, 4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [6]:
# define parameters for feature generation
proto_word_args = {
    'text_col': 'full_text_agg', 
    'user_id': 'user_id', 
    'tok_type': 'clean', 
    'isalpha': True,
    'top_k': 100,
    'word_count_thresh': 5
}

hashtag_args = {
    'text_col': 'hashtags_agg', 
    'user_id': 'user_id',
    'top_k': 50,
    'ht_count_thresh': 3
}

topic_model_args = {
    'text_col': 'clean_text_agg',
    'user_id': 'user_id',
    'stops': nltk.corpus.stopwords.words('english') + ['rt'],
    'stemmer': nltk.stem.snowball.SnowballStemmer('english'), 
    'lemmer': None
}

topic_model_params = {
    'num_topic': 20, 
    'max_df': 0.5, 
    'min_df': 1, 
    'max_feature': 1000, 
    'alpha': 0.1, 
    'eta': 0.1,  
    'serialized': None 
}

In [7]:
X_train_ft, X_test_ft, y_train, y_test = feature.featurize(aggregated, 'label', proto_word_args, hashtag_args, topic_model_args, topic_model_params, 0.2, random_state=None, topic_words=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  X_train[label] = y_train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['tokenized_text'] = df[text_col].apply(lambda x: nltk.word_tokenize(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['normalized_tokens'] = df['tokenized_text'].apply(lambda x: normalizeTokens(x, stopwordLst=stop_words, stemmer=stemmer, lemmer=lemmer))
A value is trying

### Classifiers

In [31]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [35]:
clfs = {'Random Forest': RandomForestClassifier(n_estimators=100, n_jobs=-1),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, subsample=0.5, max_depth=5),
        'AdaBoost': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=100),
        'Bagging, DT': BaggingClassifier(DecisionTreeClassifier(max_depth=1), max_samples=0.5, max_features=0.5),
        'Naive Bayes': GaussianNB(),        
        'Logistic Reg': LogisticRegression(penalty='l1', C=1e5),
        'SVM': SVC(kernel='rbf', probability=True, random_state=0),
        'Decision Tree': DecisionTreeClassifier()
            }

In [41]:
def basic_loop(clfs, X_train, y_train, X_test, y_test):
    for key, clf in clfs.items():
        print(key)
        start_time = time.time()
        clf_fit = clf.fit(X_train, y_train)
        print('Score: ', clf_fit.score(X_test, y_test))
        print("--- %s minutes ---" % round((time.time() - start_time)/60, 2))
        print()

In [42]:
basic_loop(clfs, X_train_ft, y_train, X_test_ft, y_test)

Random Forest
Score:  0.8782346685572492
--- 0.08 minutes ---

Gradient Boosting
Score:  0.8566111308046792
--- 0.69 minutes ---

AdaBoost
Score:  0.84774902516838
--- 0.27 minutes ---

Bagging, DT
Score:  0.848103509393832
--- 0.02 minutes ---

Naive Bayes
Score:  0.5233959588798298
--- 0.01 minutes ---

Logistic Reg
Score:  0.8459766040411202
--- 2.78 minutes ---

SVM
Score:  0.5235732009925558
--- 92.96 minutes ---

Decision Tree
Score:  0.8186813186813187
--- 0.02 minutes ---



In [23]:
clf = Perceptron().fit(X_train_ft, y_train)
clf.score(X_test_ft, y_test) 

0.47660404112017013

In [26]:
clf =SVC().fit(X_train_ft, y_train)
clf.score(X_test_ft, y_test) 

0.5235732009925558

In [2]:
a = pd.read_csv("X_train_ft.csv")

In [8]:
len(b.columns)

323