In this lab, we will
- read our project data into a Pandas DataFrame
- write a function to compute simple features for each row of the data frame
- fit a LogisticRegression model to the data
- print the top coefficients
- compute measures of accuracy

I've given you starter code below. You should:
- First, try to get it to work with your data. It may require changing the `load_data` function to match the requirements of your data (e.g., what is the object you are classifying -- a tweet, a user, a news article?)
- Second, you should add additional features to the make_features function:
  - Be creative. It could be additional word features, or other metadata about the user, date, etc.
- As you try out different feature combinations, print out the coefficients and accuracy scores
- List any features that seem to improve accuracy. Why do you think that is?

In [36]:
from collections import Counter
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [69]:
def load_data(datafile, checkfile):
    """
    Read the project data into a single pandas dataframe where
    - each row is an instance to be classified (here: a tweet that has a
      fact-check ruling)
    - there is a column called `label` which stores the class label

    datafile:  path to a CSV with (at least) the columns `social_id` and
               `comment_tokens`
    checkfile: path to a CSV with (at least) the columns `site`, `social_id`
               and `ruling_val`

    Returns a dataframe with the columns `site`, `id`, `label`, `text`,
    containing one row per (fact-check, distinct tweet text) pair whose ids match.
    """
    df = pd.read_csv(datafile)[['social_id', 'comment_tokens']]
    ck = pd.read_csv(checkfile)

    # Keep only the fact-checks that refer to Twitter. The explicit copy makes
    # `ck` an independent frame, so the column assignment below does not write
    # into a slice of the original (which pandas warns about).
    ck = ck.loc[ck['site'] == 'twitter', ['site', 'social_id', 'ruling_val']].copy()

    # The two files may parse the id column with different dtypes; align them
    # so the merge keys compare equal.
    ck['social_id'] = ck['social_id'].astype(df['social_id'].dtype)

    # Rename by column name rather than by position so a change in column
    # order cannot silently mislabel the data.
    df = df.rename(columns={'social_id': 'id', 'comment_tokens': 'text'})
    df = df.drop_duplicates(['id', 'text'])
    ck = ck.rename(columns={'social_id': 'id', 'ruling_val': 'label'})

    return pd.merge(ck, df, on=['id'], how='inner')

# Paths are relative to this notebook. Forward slashes are accepted by Python
# on Windows as well as on Linux/macOS, whereas backslash-separated paths only
# resolve on Windows.
df = load_data('../../training_data/twitter.csv', '../../training_data/factchecks.csv')
df.head()

Unnamed: 0,site,id,label,text
0,twitter,920307162278236160,-2.0,rt @allisonpohle : o-h-i-no ! this story wasn'...
1,twitter,920464477388029952,-2.0,"rt @theellenshow : tomorrow , the first people..."
2,twitter,918272604410122240,-2.0,rt @lauraloomer : investigative journalist lau...
3,twitter,918689132137791488,-2.0,rt @lauraloomer : #jesuscampos be miss ? laura...
4,twitter,913244272643653632,-2.0,rt @tacticalpoet84 : urlref\nfollow-up from my...


In [70]:
# what is the distribution over class labels? (number of rows per distinct label value)
df.label.value_counts()

-2.0    650
 0.0    201
-1.0    176
 2.0    141
 1.0     89
Name: label, dtype: int64

In [93]:
def tweet_tokenizer(s):
    """
    Split the tweet text `s` into a list of lowercase word tokens.

    Hashtags, @-mentions and URLs are first rewritten with the prefixes
    HASHTAG_ / MENTION_ and the placeholder THIS_IS_A_URL. Because the text is
    lowercased afterwards, these come out as e.g. `hashtag_maga`,
    `mention_user` and `this_is_a_url`. Every run of non-word characters is
    then treated as a separator.
    """
    s = re.sub(r'#(\S+)', r'HASHTAG_\1', s)
    s = re.sub(r'@(\S+)', r'MENTION_\1', s)
    s = re.sub(r'http\S+', 'THIS_IS_A_URL', s)
    # Raw string: in a plain literal '\W' is an invalid escape sequence, which
    # newer Python versions warn about.
    return re.sub(r'\W+', ' ', s.lower()).split()

def counters(d):
    """Tally how often each item of `d` occurs and return the tallies as a Counter."""
    # Counter is a dict subclass mapping item -> number of occurrences.
    return Counter(d)

# Flatten all tweets into one token stream and count every token.
tokens = [tok for text in df['text'] for tok in tweet_tokenizer(text)]
counts = counters(tokens)
# Track the tokens ranked 31st-60th by frequency: the 30 most frequent tokens
# are skipped.
words_to_track = [word for word, _count in counts.most_common(70)[30:60]]

In [94]:
def make_features(df, words=None):
    """
    Build a word-count feature matrix for the rows of `df`.

    df:    dataframe with a string column `text`
    words: iterable of words to use as features; when omitted, the
           notebook-level `words_to_track` list is used

    Returns (X, vec) where X is the matrix produced by DictVectorizer (one row
    per row of `df`, one column per tracked word, each value being how many
    times the word occurs in that row's text) and vec is the fitted
    DictVectorizer, which maps feature names to column indices.
    """
    if words is None:
        words = words_to_track
    # NOTE(review): this splits on non-word characters only, whereas
    # tweet_tokenizer also rewrites hashtags/mentions/URLs. Tracked tokens such
    # as `hashtag_...` can therefore never be counted here -- confirm whether
    # the two tokenizations are meant to differ.
    non_word = re.compile(r'\W+')
    feature_dicts = []
    for text in df['text']:
        token_counts = Counter(non_word.sub(' ', text.lower()).split())
        # Store zero counts explicitly so that every tracked word is present
        # as a key in each row's feature dict.
        feature_dicts.append({w: token_counts[w] for w in words})
    vec = DictVectorizer()
    X = vec.fit_transform(feature_dicts)
    return X, vec
                
# build the feature matrix X and keep the fitted vectorizer for feature-name lookups
X, vec = make_features(df)

In [95]:
# what are the dimensions of the feature matrix? (rows = instances, columns = features)
X.shape

(1257, 30)

In [96]:
# what are the feature names?
# vocabulary_ is a dict from feature name to its column index in X
vec.vocabulary_

{'about': 0,
 'all': 1,
 'an': 2,
 'but': 3,
 'by': 4,
 'from': 5,
 'get': 6,
 'go': 7,
 'his': 8,
 'if': 9,
 'just': 10,
 'make': 11,
 'me': 12,
 'more': 13,
 'my': 14,
 'obama': 15,
 'one': 16,
 'our': 17,
 'out': 18,
 'people': 19,
 'say': 20,
 'should': 21,
 'take': 22,
 'than': 23,
 'they': 24,
 'u': 25,
 'up': 26,
 'what': 27,
 'will': 28,
 'would': 29}

In [97]:
# how often does each word occur?
# X[:,idx] is the column for `word`; summing it gives the word's total count over all rows
for word, idx in vec.vocabulary_.items():
    print('%20s\t%d' % (word, X[:,idx].sum()))

                  my	82
                   u	80
                from	79
                  by	79
                 say	79
                they	73
                 our	66
                  if	64
                more	64
              people	63
               about	63
                  an	62
                 his	60
                will	60
                 but	57
                just	56
                what	55
               would	55
                 get	54
               obama	54
                  go	52
                 all	51
                 out	48
                take	48
                 one	47
                  up	46
                  me	46
                make	45
                than	45
              should	44


In [98]:
# can also get a simple list of feature names, ordered by column index:
vec.get_feature_names()
# e.g., first column is 'about', second is 'all', etc.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version.

['about',
 'all',
 'an',
 'but',
 'by',
 'from',
 'get',
 'go',
 'his',
 'if',
 'just',
 'make',
 'me',
 'more',
 'my',
 'obama',
 'one',
 'our',
 'out',
 'people',
 'say',
 'should',
 'take',
 'than',
 'they',
 'u',
 'up',
 'what',
 'will',
 'would']

In [99]:
# we'll first store the class labels separately in a numpy array (one entry per row of df)
y = np.array(df.label)
# how many rows carry each label?
Counter(y)

Counter({-2.0: 650, -1.0: 176, 0.0: 201, 1.0: 89, 2.0: 141})

In [100]:
# to find the row indices whose label is 0.0
np.where(y==0.0)[0]

array([ 826,  827,  828,  829,  830,  831,  832,  833,  834,  835,  836,
        837,  838,  839,  840,  841,  842,  843,  844,  845,  846,  847,
        848,  849,  850,  851,  852,  853,  854,  855,  856,  857,  858,
        859,  860,  861,  862,  863,  864,  865,  866,  867,  868,  869,
        870,  871,  872,  873,  874,  875,  876,  877,  878,  879,  880,
        881,  882,  884,  885,  886,  887,  888,  889,  890,  891,  892,
        893,  894,  895,  896,  897,  898,  899,  900,  901,  902,  903,
        904,  905,  906,  907,  908,  909,  910,  911,  912,  913,  914,
        915,  916,  917,  918,  919,  920,  921,  922,  923,  924,  925,
        926,  927,  928,  929,  930,  931,  932,  933,  934,  935,  936,
        937,  938,  939,  940,  941,  942,  943,  944,  945,  946,  947,
        948,  949,  950,  951,  952,  953,  954,  955,  956,  957,  958,
        959,  960,  961,  962,  963,  964,  965,  966,  967,  968,  969,
        970,  971,  972,  973,  974,  975,  976,  9

In [101]:
# store the distinct class labels (a set, so iteration order is not guaranteed)
class_names = set(df.label)

In [102]:
# how often does each word appear in each class?
for word, idx in vec.vocabulary_.items():
    for class_name in class_names:
        # row indices of the instances labelled `class_name`
        class_idx = np.where(y==class_name)[0]
        # total count of `word` (column idx) over just those rows
        print('%20s\t%20s\t%d' % (word, class_name, X[class_idx, idx].sum()))

                  my	                 0.0	18
                  my	                 1.0	1
                  my	                 2.0	6
                  my	                -1.0	8
                  my	                -2.0	49
                   u	                 0.0	12
                   u	                 1.0	17
                   u	                 2.0	1
                   u	                -1.0	13
                   u	                -2.0	37
                from	                 0.0	11
                from	                 1.0	5
                from	                 2.0	14
                from	                -1.0	12
                from	                -2.0	37
                  by	                 0.0	11
                  by	                 1.0	8
                  by	                 2.0	12
                  by	                -1.0	14
                  by	                -2.0	34
                 say	                 0.0	9
                 say	                 1.0	6
                 s

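So, `you` appears more frequently in the positive (hostile) class, and `love` appears more frequently in the negative (non-hostile) class. (Note: neither word is among the 30 tracked features above, and the labels here are the five ruling values from -2.0 to 2.0, so this remark appears to be carried over from the starter example rather than describing the output above.)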

In [106]:
# fit a LogisticRegression classifier on all rows.
# multi_class='multinomial' requests a single multinomial model over all class labels.
clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
clf.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [107]:
# for binary classification, LogisticRegression stores a single coefficient vector;
# for a multi-class problem, as here, coef_ is a matrix with one row per class
# and one column per feature.
clf.coef_

array([[ 4.66264586e-01,  2.18832168e-01,  4.30355060e-01,
        -6.87737434e-02, -2.98443577e-01, -6.44924736e-02,
         4.49855891e-01,  1.85887022e-01,  5.40971027e-02,
        -1.85057366e-01,  6.52555348e-02, -1.75669454e-01,
         1.17144906e-01, -1.13060551e-01,  3.56406255e-01,
        -1.86299058e-01, -2.61485205e-01, -2.02570081e-01,
         3.46517894e-01,  1.52334089e-01,  1.03367641e-01,
        -1.79569278e-01, -3.42108893e-01, -8.33473442e-01,
        -6.82376290e-02, -5.02439352e-02, -3.24118046e-01,
        -8.04844260e-03,  7.25513050e-02,  1.08527476e-01],
       [-1.78385802e-01, -2.85199417e-01,  1.57857473e-01,
         2.74415140e-02,  1.05392800e-01,  4.68731907e-02,
         5.17749842e-01,  3.62592283e-01,  2.89923932e-02,
         2.97060486e-01, -3.91642852e-02, -3.29485105e-01,
        -6.15299110e-01, -1.42563528e-01, -1.90914969e-02,
         1.42745390e-01,  9.19296707e-02,  5.20251790e-02,
        -7.67960678e-02, -1.70918048e-01,  9.52550017e-

In [108]:
# for binary classification, the coefficients for the negative class are just the
# negative of those for the positive class. Here there are several classes, so each
# row of coef holds one class's coefficients (rows are indexed in the order of clf.classes_).
coef = clf.coef_
print(coef)

[[ 4.66264586e-01  2.18832168e-01  4.30355060e-01 -6.87737434e-02
  -2.98443577e-01 -6.44924736e-02  4.49855891e-01  1.85887022e-01
   5.40971027e-02 -1.85057366e-01  6.52555348e-02 -1.75669454e-01
   1.17144906e-01 -1.13060551e-01  3.56406255e-01 -1.86299058e-01
  -2.61485205e-01 -2.02570081e-01  3.46517894e-01  1.52334089e-01
   1.03367641e-01 -1.79569278e-01 -3.42108893e-01 -8.33473442e-01
  -6.82376290e-02 -5.02439352e-02 -3.24118046e-01 -8.04844260e-03
   7.25513050e-02  1.08527476e-01]
 [-1.78385802e-01 -2.85199417e-01  1.57857473e-01  2.74415140e-02
   1.05392800e-01  4.68731907e-02  5.17749842e-01  3.62592283e-01
   2.89923932e-02  2.97060486e-01 -3.91642852e-02 -3.29485105e-01
  -6.15299110e-01 -1.42563528e-01 -1.90914969e-02  1.42745390e-01
   9.19296707e-02  5.20251790e-02 -7.67960678e-02 -1.70918048e-01
   9.52550017e-03  6.89788442e-02  3.08925809e-01 -4.04516871e-01
   4.61071862e-02  2.05741359e-01  4.83640789e-02  6.54597477e-01
  -3.03036164e-01 -2.15438898e-01]
 [-5.6

In [109]:
# show each class's coefficient row as a one-row table with the feature names as column headers
for ci, class_name in enumerate(clf.classes_):
    print('coefficients for %s' % class_name)
    # `display` is presumably the notebook's rich-output helper (not imported in this file)
    display(pd.DataFrame([coef[ci]], columns=vec.get_feature_names()))

coefficients for -2.0


Unnamed: 0,about,all,an,but,by,from,get,go,his,if,...,say,should,take,than,they,u,up,what,will,would
0,0.466265,0.218832,0.430355,-0.068774,-0.298444,-0.064492,0.449856,0.185887,0.054097,-0.185057,...,0.103368,-0.179569,-0.342109,-0.833473,-0.068238,-0.050244,-0.324118,-0.008048,0.072551,0.108527


coefficients for -1.0


Unnamed: 0,about,all,an,but,by,from,get,go,his,if,...,say,should,take,than,they,u,up,what,will,would
0,-0.178386,-0.285199,0.157857,0.027442,0.105393,0.046873,0.51775,0.362592,0.028992,0.29706,...,0.009526,0.068979,0.308926,-0.404517,0.046107,0.205741,0.048364,0.654597,-0.303036,-0.215439


coefficients for 0.0


Unnamed: 0,about,all,an,but,by,from,get,go,his,if,...,say,should,take,than,they,u,up,what,will,would
0,-0.566699,-0.347385,-0.439238,0.224935,-0.187657,-0.170492,-0.079946,-0.24304,0.216972,0.294664,...,-0.295379,-0.064887,0.251581,0.215307,0.402827,0.005863,0.21081,-0.390824,-0.213693,-0.119234


coefficients for 1.0


Unnamed: 0,about,all,an,but,by,from,get,go,his,if,...,say,should,take,than,they,u,up,what,will,would
0,0.076535,0.303181,0.595622,-0.391475,0.194439,-0.173341,0.003672,-0.346679,-0.125167,-0.577183,...,0.171467,-0.054822,-0.251419,0.989589,0.292433,1.100879,0.179742,0.166191,0.443104,0.475597


coefficients for 2.0


Unnamed: 0,about,all,an,but,by,from,get,go,his,if,...,say,should,take,than,they,u,up,what,will,would
0,0.202285,0.110571,-0.744596,0.207872,0.186268,0.361452,-0.891332,0.041239,-0.174894,0.170517,...,0.011018,0.2303,0.033021,0.033094,-0.67313,-1.26224,-0.114799,-0.421916,0.001073,-0.249452


In [118]:
from sklearn.base import clone
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# 5-fold cross-validation: train on four fifths of the rows, measure accuracy
# on the held-out fifth, and repeat so that every row is held out once.
# shuffle + fixed random_state gives the same random split on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []
for train, test in kf.split(X):
    # Fit an unfitted copy with the same hyperparameters, so the notebook's
    # `clf` is not overwritten by a model trained on only part of the rows.
    fold_clf = clone(clf)
    fold_clf.fit(X[train], y[train])
    pred = fold_clf.predict(X[test])
    accuracies.append(accuracy_score(y[test], pred))
print('accuracy over all cross-validation folds: %s' % str(accuracies))
print('mean=%.2f std=%.2f' % (np.mean(accuracies), np.std(accuracies)))

accuracy over all cross-validation folds: [0.5515873015873016, 0.5119047619047619, 0.5219123505976095, 0.50199203187251, 0.4900398406374502]
mean=0.52 std=0.02
