### Create features 

In [33]:
import pandas as pd
import pickle
import operator

In [2]:
from IPython.display import display
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.set_option('display.float_format', lambda x: '%.0f' % x)

In [3]:
with open('../data_preprocessing/data/svo_df.pkl', 'rb') as f:
    df = pickle.load(f)

In [74]:
ts_lex = {}
with open('./data/ts_lex.txt','r',encoding='utf-8') as f:
    for line in f:
        entry = line.split(' ')
        ts_lex[entry[0]] = float(entry[1])

In [172]:
def svo_sentiments(df, lex, label_col, window, count_thresh):
    
    # get sentiment around svo (aggregate and count of tweets)
    svo_sentiments = {
        'subject': {},
        'object': {}
    }
    
    labels = df[label_col].unique()
    for label in labels:
        for key in svo_sentiments.keys():
            svo_sentiments[key][label] = {}
       
        # all svos for one label
        label_svos = df[df[label_col] == label].reset_index()
        for i in range(label_svos.shape[0]):
            user_svo_list = label_svos.loc[i,'svos']
            for tweet_svo_list in user_svo_list:
                for svo in tweet_svo_list:
                    s = svo[0]
                    o = svo[2]
                    # checking s and o in context
                    tokenized_text_lists = label_svos.loc[i,'tokenized_text_agg']
                    for tokenized_text in tokenized_text_lists:
                        
                        # for 's'ubjects
                        if s != '' and '/' not in s:
                            if s in tokenized_text:
                                s_index = tokenized_text.index(s)
                               
                                # get words in window
                                lower_bound = max(0, s_index-window)
                                upper_bound = min(len(tokenized_text), s_index+1+window)
                                pre_words = tokenized_text[lower_bound : s_index]
                                post_words = tokenized_text[s_index+1: upper_bound]
                                window_words = pre_words + post_words
                                
                                # tally up sentiment score for s 
                                sentiment_total = 0
                                sentiment_count = 0
                                for word in window_words:
                                    if word in lex:
                                        sentiment_total += lex[word]
                                        sentiment_count += 1
                                
                                # build dictionary
                                if sentiment_count != 0:
                                    avg_sentiment = sentiment_total/sentiment_count
                                    if s in svo_sentiments['subject'][label]:
                                        svo_sentiments['subject'][label][s]['total_sentim'] += avg_sentiment
                                        svo_sentiments['subject'][label][s]['count'] += 1    
                                    else:
                                        svo_sentiments['subject'][label][s] = {'total_sentim': avg_sentiment, 'count': 1}
                                else:
                                    continue
                        
                        # for 'o'bjects
                        if o != '' and '/' not in o:
                            if o in tokenized_text:
                                o_index = tokenized_text.index(o)
                               
                                # get words in window
                                lower_bound = max(0, o_index-window)
                                upper_bound = min(len(tokenized_text), o_index+1+window)
                                pre_words = tokenized_text[lower_bound : o_index]
                                post_words = tokenized_text[o_index+1: upper_bound]
                                window_words = pre_words + post_words
                                
                                # tally up score for s 
                                sentiment_total = 0
                                sentiment_count = 0
                                for word in window_words:
                                    if word in lex:
                                        sentiment_total += lex[word]
                                        sentiment_count += 1
                                
                                if sentiment_count != 0:
                                    avg_sentiment = sentiment_total/sentiment_count
                                    if o in svo_sentiments['object'][label]:
                                        svo_sentiments['object'][label][o]['total_sentim'] += avg_sentiment
                                        svo_sentiments['object'][label][o]['count'] += 1    
                                    else:
                                        svo_sentiments['object'][label][o] = {'total_sentim': avg_sentiment, 'count': 1}
                                else:
                                    continue
    
    # get average sentiment (if count above threshold) per label, per word 
    sentiments = {
        'subject': {},
        'object': {}
    }
    for label in labels:
        for key in sentiments.keys():
            sentiments[key][label] = {}
            
    for sv in svo_sentiments.keys():
        for label_key in svo_sentiments[sv]:
            for word_key in svo_sentiments[sv][label_key]:
                if svo_sentiments[sv][label_key][word_key]['count'] >= count_thresh:
                    sentiments[sv][label_key][word_key] = svo_sentiments[sv][label_key][word_key]['total_sentim'] / svo_sentiments[sv][label_key][word_key]['count']

    return sentiments, svo_sentiments

In [201]:
s, ss =svo_sentiments(df, ts_lex, 'label', 4, 6)

In [206]:
def most_pos_neg_sents(sent_dict, k):
    k_sentiment_dict = {}
    for sv in sent_dict.keys():
        for label in sent_dict[sv].keys():
            k_sentiment_dict[label] = {}
#             k_sentiment_dict[label][sv] = {}
            sorted_list = sorted(sent_dict[sv][label].items(), key=operator.itemgetter(1))
            sorted_list.reverse()
            # get k most positive, 
            kpos_list = sorted_list[:k] 
            kpos_list = [pair[0] for pair in kpos_list]
            if 'POSITIVE' in k_sentiment_dict[label].keys():
                k_sentiment_dict[label]['POSITIVE'] += kpos_list
            else:
                k_sentiment_dict[label]['POSITIVE'] = kpos_list
            
            # get k most negative words
            kneg_list = sorted_list[len(sorted_list)-k :]
            kneg_list = [pair[0] for pair in kneg_list]
            if 'NEGATIVE' in k_sentiment_dict[label].keys():
                k_sentiment_dict[label]['NEGATIVE'] += kneg_list
            else:
                k_sentiment_dict[label]['NEGATIVE'] = kneg_list
            
#     for label in k_sentiment_dict.keys():
#         for sv in k_sentiment_dict[label].keys():
            
    return k_sentiment_dict

In [207]:
most_pos_neg_sents(s, 20)

{'d': {'NEGATIVE': ['fact',
   'philosophy',
   'senator',
   'economy',
   'control',
   'center',
   'themselves',
   'violence',
   'workers',
   'law',
   'press',
   'professionals',
   'row',
   'email',
   'role',
   'crisis',
   'opposition',
   'morning',
   'the…',
   'teenager'],
  'POSITIVE': ['you…',
   'bloomberg',
   'movement',
   'topics',
   'tonight.',
   'fun',
   'us—it',
   'kick-ass',
   'yorker',
   'conscience',
   'friend',
   'boo',
   'sign',
   'family',
   'border',
   'choice',
   'message',
   'dress',
   'maga',
   'lady']},
 'r': {'NEGATIVE': ['course',
   'protesters',
   'businesses',
   'journalists',
   'rally',
   'attacks',
   'animals',
   'funding',
   'checkers',
   'control',
   'h…',
   'refugees',
   'group',
   'history',
   'chat',
   'blacks',
   'racism',
   'surgery',
   'lights',
   'porn'],
  'POSITIVE': ['natives',
   'chris',
   'movement',
   'lady',
   'champion',
   'fan',
   'event',
   'conscience',
   'fightback',
   'rt',
  

In [199]:
most_pos_neg_sents(s, 20)

{'d': {'object': {'NEGATIVE': ['fact',
    'philosophy',
    'senator',
    'economy',
    'control',
    'center',
    'themselves',
    'violence',
    'workers',
    'law',
    'press',
    'professionals',
    'row',
    'email',
    'role',
    'crisis',
    'opposition',
    'morning',
    'the…',
    'teenager'],
   'POSITIVE': ['you…',
    'bloomberg',
    'movement',
    'topics',
    'tonight.',
    'fun',
    'us—it',
    'kick-ass',
    'yorker',
    'conscience',
    'friend',
    'boo',
    'sign',
    'family',
    'border',
    'choice',
    'message',
    'dress',
    'maga',
    'lady']}},
 'r': {'object': {'NEGATIVE': ['course',
    'protesters',
    'businesses',
    'journalists',
    'rally',
    'attacks',
    'animals',
    'funding',
    'checkers',
    'control',
    'h…',
    'refugees',
    'group',
    'history',
    'chat',
    'blacks',
    'racism',
    'surgery',
    'lights',
    'porn'],
   'POSITIVE': ['natives',
    'chris',
    'movement',
    'lad

In [186]:
s

{'object': {'d': {'!': 0.16511259333333336,
   '%': 0.2947949551282051,
   "'s": -0.30526925000000005,
   'access': -0.20673369047619047,
   'act': 0.27183870555555567,
   'actors': -0.14519385,
   'america': 0.16246241666666664,
   'american': 0.35035742592592595,
   'americans': -0.12426704166666669,
   'amp': 0.20866940307377063,
   'anyone': 0.25396687878787877,
   'anything': 0.18515841666666666,
   'ass': 0.2948421666666667,
   'balloon': 0.37660632499999996,
   'banks': 0.11465423958333333,
   'bern': -0.2832115,
   'bernie': -0.003115312500000028,
   'berniesanders': 0.22744127311827958,
   'bill': 0.20979444444444442,
   'birth': 0.07615349999999999,
   'bloomberg': 0.8247366111111112,
   'boo': 0.6852041666666667,
   'boos': 0.3048758888888889,
   'boots': 0.07179318333333334,
   'border': 0.65883225,
   'brazile': 0.16132435714285714,
   'bunch': 0.4058383055555555,
   'business': 0.10414090277777772,
   'call': 0.41905995,
   'campaign': 0.39832619999999996,
   'candidate':

In [183]:
sorted_svo_list

[('ddlovato', 0.871744784090909),
 ('poll', 0.8463194999999999),
 ('.flotus', 0.8149830000000001),
 ('reform', 0.8003698333333333),
 ('train', 0.7615953823529411),
 ('thanks', 0.7502806428571428),
 ('fact', 0.6991355874999999),
 ('leak', 0.6938968636363636),
 ('silverman', 0.6752535250000001),
 ('kaine', 0.6656406360082311),
 ('call', 0.6649735833333332),
 ('week', 0.6617007142857142),
 ('chelsea', 0.6613483749999999),
 ('problem', 0.65980609375),
 ('watch', 0.6562660303030302),
 ('granholm', 0.6490747916666667),
 ('tonight', 0.6423980416666667),
 ('moment', 0.6255205500000001),
 (']', 0.623372893939394),
 ('her', 0.6230688262121212),
 ('wall', 0.6219847142857144),
 ('somebody', 0.6127286666666666),
 ('bloomberg', 0.6082159264705882),
 ('let', 0.5804925833333333),
 ('🔦this', 0.5799690000000001),
 ('looks', 0.5744452083333333),
 ('flotus', 0.55437675),
 ('speech', 0.5363816000000003),
 ('streep', 0.5232797285714286),
 ('%', 0.5151539032258066),
 ('everything', 0.5150088333333334),
 ('ch

In [184]:
sorted_svo_list = sorted(s['subject']['r'].items(), key=operator.itemgetter(1))
sorted_svo_list.reverse()

In [185]:
sorted_svo_list

[('+', 0.9462980952380953),
 ('demeanor', 0.9105356666666671),
 ('mongols', 0.869992),
 ('michelleobama', 0.7992399166666666),
 ('trump2016', 0.7661866592592593),
 ('please', 0.7444033846153846),
 (']', 0.7404247777777777),
 ('hell', 0.7132058749999999),
 ('walker', 0.7107998333333333),
 ('https…', 0.6821358636363637),
 ('thanks', 0.6818505757575757),
 ('chris', 0.6529306944444444),
 ('looks', 0.6383040052083334),
 ("'i", 0.6356741388888889),
 ('usa', 0.6160760833333334),
 ('rnc2016', 0.610170375),
 ('thiel', 0.5704917407407406),
 ('drjillstein', 0.555955),
 ('pledge', 0.5498673888888889),
 ('world', 0.5481190384615384),
 ('husband', 0.5468627222222222),
 ('band', 0.5369122500000001),
 ('rights', 0.5367412500000001),
 ('cotton', 0.5310352500000001),
 ('check', 0.5101350833333334),
 ('sessions', 0.5004350694444445),
 ('johnson', 0.49159206666666666),
 ('word', 0.49059188888888894),
 ('story', 0.4807294166666667),
 ('baio', 0.4804442058823528),
 ('re', 0.47406181249999996),
 ('football',

In [53]:
def svo_counts(df, label_col):
    svo_counts = {
        'subject': {'total': {}},
        'verb': {'total': {}},
        'object': {'total': {}}
    }
    
    labels = df[label_col].unique()
    for label in labels:
        for svo_key in svo_counts.keys():
            svo_counts[svo_key][label] = {}

        # all svos for one label
        label_svos = df[df[label_col] == label].reset_index()
        for i in range(label_svos.shape[0]):
            user_svo_list = label_svos.loc[i,'svos']
            for tweet_svo_list in user_svo_list:
                for svo in tweet_svo_list:
                    for j in range(3):
                        word = svo[j]
                        if j == 0:
                            svo_key = 'subject'
                        elif j == 1:
                            svo_key = 'verb'
                        elif j == 2:
                            svo_key = 'object'
                        
                        # build dictionary
                        if word in svo_counts[svo_key]['total']:
                            svo_counts[svo_key]['total'][word] += 1
                            if word in svo_counts[svo_key][label]:
                                svo_counts[svo_key][label][word] += 1
                            else:
                                svo_counts[svo_key][label][word] = 1
                        else:
                            svo_counts[svo_key]['total'][word] = 1
                            svo_counts[svo_key][label][word] = 1                    
    return svo_counts


In [54]:
c = svo_counts(df, 'label')

In [142]:
c

{'object': {'d': {'': 17836,
   'lot': 39,
   'narrative': 8,
   'heart': 16,
   'man': 67,
   'woman': 63,
   'votes': 31,
   'nomination': 42,
   '//t.co/dyyxix9tm6': 1,
   'part': 29,
   'yourself': 3,
   'platform': 17,
   'you': 227,
   '//t.co/rili4eqxvf': 1,
   'https': 1194,
   '//t.co/39ghe5…': 3,
   'dnc': 139,
   'voters': 11,
   'dogs': 1,
   'demsinphilly': 293,
   'amp': 252,
   'things': 32,
   'campaign': 11,
   'them': 88,
   'lady': 11,
   'cnn': 13,
   'dream': 5,
   'something': 42,
   'moment': 36,
   '//t.co/j59tsmwcde': 1,
   '//t.co/ehg96ihyv1': 1,
   'us': 162,
   'cbseveningnews': 1,
   '//t.co/g0z8drzvgp': 1,
   'thebigotticket': 1,
   '//t.co/hzw4ydforv': 1,
   '//t.co/tic0ezj8zc': 1,
   'tacks': 1,
   '//t.co/bgtjyfjut0': 1,
   'character': 1,
   'bag': 3,
   'anything': 60,
   'star': 5,
   'stfu': 1,
   'right': 27,
   'criminalhillary..so': 1,
   'nation': 24,
   '//t.co/ylane9…': 1,
   'democrats': 18,
   'everyone': 18,
   'vote': 60,
   'time': 76,
  

In [55]:
def svo_lists(svo_counts, labels, top_k):
    svo_list_dict = {}
    svo_only_label_dict = {}
    
    for svo_key in svo_counts.keys():
        # find top k words per label
        svo_list_dict[svo_key] = {}
        svo_only_label_dict[svo_key] = {}
        for label in labels:            
            sorted_svo_list = sorted(svo_counts[svo_key][label].items(), key=operator.itemgetter(1))
            sorted_svo_list.reverse()
            svo_list_dict[svo_key][label] = sorted_svo_list[: top_k]
        
        # find unique and common words between labels (only works for 2 labels!)
        label1_words = [x[0] for x in svo_list_dict[svo_key][labels[0]]]
        label2_words = [x[0] for x in svo_list_dict[svo_key][labels[1]]]
        label1_unique = [word for word in label1_words if word not in label2_words]
        label2_unique = [word for word in label2_words if word not in label1_words]
        common = [word for word in label1_words if word in label2_words]
        svo_only_label_dict[svo_key][labels[0]] = label1_unique
        svo_only_label_dict[svo_key][labels[1]] = label2_unique
        svo_only_label_dict[svo_key]['common in top ' + str(top_k)] = common
            
    return svo_list_dict, svo_only_label_dict
        

In [60]:
svo_list_dict, svo_only_label_dict = svo_lists(c, ['d','r'], 20)

In [62]:
def create_sentiment_features(df, svo_only_label_dict, window):
    
    # subject
    
    

In [59]:
d2

{'object': {'common in top 20': ['',
   'https',
   'it',
   'convention',
   'amp',
   'you',
   'trump',
   'speech',
   'us',
   'president',
   'me',
   'people',
   'her'],
  'd': ['demsinphilly',
   'clinton',
   'dnc',
   'demconvention',
   'night',
   'them',
   'party'],
  'r': ['rncincle',
   'gopconvention',
   'america',
   'time',
   'him',
   'graph',
   'http…']},
 'subject': {'common in top 20': ['',
   'we',
   'you',
   'it',
   'they',
   'she',
   'he',
   'https',
   'trump',
   'people',
   'convention',
   '’'],
  'd': ['dnc',
   'clinton',
   'obama',
   'i',
   'hillaryclinton',
   'demsinphilly',
   'hillary',
   'supporters'],
  'r': ['rncincle',
   'cruz',
   'gop',
   'melania',
   'speech',
   'america',
   'realdonaldtrump',
   'republicans']},
 'verb': {'common in top 20': ['be ',
   '',
   "'s ",
   'have ',
   'say ',
   'get ',
   'do ',
   'make ',
   'https ',
   'want ',
   'speak ',
   'go ',
   'see ',
   'know ',
   'think ',
   'watch ',
   'n

In [6]:
# define parameters for feature generation
proto_word_args = {
    'text_col': 'full_text_agg', 
    'user_id': 'user_id', 
    'tok_type': 'clean', 
    'isalpha': True,
    'top_k': 100,
    'word_count_thresh': 5
}

hashtag_args = {
    'text_col': 'hashtags_agg', 
    'user_id': 'user_id',
    'top_k': 50,
    'ht_count_thresh': 3
}

topic_model_args = {
    'text_col': 'clean_text_agg',
    'user_id': 'user_id',
    'stops': nltk.corpus.stopwords.words('english') + ['rt'],
    'stemmer': nltk.stem.snowball.SnowballStemmer('english'), 
    'lemmer': None
}

topic_model_params = {
    'num_topic': 20, 
    'max_df': 0.5, 
    'min_df': 1, 
    'max_feature': 1000, 
    'alpha': 0.1, 
    'eta': 0.1,  
    'serialized': None 
}

In [7]:
X_train_ft, X_test_ft, y_train, y_test = feature.featurize(aggregated, 'label', proto_word_args, hashtag_args, topic_model_args, topic_model_params, 0.2, random_state=None, topic_words=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  X_train[label] = y_train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['tokenized_text'] = df[text_col].apply(lambda x: nltk.word_tokenize(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['normalized_tokens'] = df['tokenized_text'].apply(lambda x: normalizeTokens(x, stopwordLst=stop_words, stemmer=stemmer, lemmer=lemmer))
A value is trying

### Classifiers

In [31]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [35]:
clfs = {'Random Forest': RandomForestClassifier(n_estimators=100, n_jobs=-1),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, subsample=0.5, max_depth=5),
        'AdaBoost': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=100),
        'Bagging, DT': BaggingClassifier(DecisionTreeClassifier(max_depth=1), max_samples=0.5, max_features=0.5),
        'Naive Bayes': GaussianNB(),        
        'Logistic Reg': LogisticRegression(penalty='l1', C=1e5),
        'SVM': SVC(kernel='rbf', probability=True, random_state=0),
        'Decision Tree': DecisionTreeClassifier()
            }

In [41]:
def basic_loop(clfs, X_train, y_train, X_test, y_test):
    for key, clf in clfs.items():
        print(key)
        start_time = time.time()
        clf_fit = clf.fit(X_train, y_train)
        print('Score: ', clf_fit.score(X_test, y_test))
        print("--- %s minutes ---" % round((time.time() - start_time)/60, 2))
        print()

In [42]:
basic_loop(clfs, X_train_ft, y_train, X_test_ft, y_test)

Random Forest
Score:  0.8782346685572492
--- 0.08 minutes ---

Gradient Boosting
Score:  0.8566111308046792
--- 0.69 minutes ---

AdaBoost
Score:  0.84774902516838
--- 0.27 minutes ---

Bagging, DT
Score:  0.848103509393832
--- 0.02 minutes ---

Naive Bayes
Score:  0.5233959588798298
--- 0.01 minutes ---

Logistic Reg
Score:  0.8459766040411202
--- 2.78 minutes ---

SVM
Score:  0.5235732009925558
--- 92.96 minutes ---

Decision Tree
Score:  0.8186813186813187
--- 0.02 minutes ---



In [23]:
clf = Perceptron().fit(X_train_ft, y_train)
clf.score(X_test_ft, y_test) 

0.47660404112017013

In [26]:
clf =SVC().fit(X_train_ft, y_train)
clf.score(X_test_ft, y_test) 

0.5235732009925558

In [2]:
a = pd.read_csv("X_train_ft.csv")

In [8]:
len(b.columns)

323