In this lab, we will
- read our project data into a Pandas DataFrame
- write a function to compute simple features for each row of the data frame
- fit a LogisticRegression model to the data
- print the top coefficients
- compute measures of accuracy

I've given you starter code below. You should:
- First, try to get it to work with your data. It may require changing the load_data file to match the requirements of your data (e.g., what is the object you are classifying -- a tweet, a user, a news article?)
- Second, you should add additional features to the make_features function:
  - Be creative. It could be additional word features, or other meta data about the user, date, etc.
- As you try out different feature combinations, print out the coefficients and accuracy scores
- List any features that seem to improve accuracy. Why do you think that is?

In [73]:
from collections import Counter
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [74]:
def load_data(datafile, checkfacts):
    """
    Read your data into a single pandas dataframe where
    - each row is an instance to be classified
    (this could be a tweet, user, or news article, depending on your project)
    - there is a column called `label` which stores the class label (e.g., the true
      category for this row)
    """
    df = pd.read_csv(datafile,dtype={'social_id': str })[['social_id','comment_tokens']]
    cf = pd.read_csv(checkfacts,dtype={'social_id': str })[['site','social_id','ruling']]
    
    df = pd.read_csv(datafile)[['social_id','comment_tokens']]
    cf = pd.read_csv(checkfacts)[['site','social_id','ruling']]
    cf = cf[cf['site'] == 'facebook']

    df.columns = ['item_id', 'content']
    cf.columns = ['site','item_id','label']
    
    df = df.drop_duplicates(['item_id','content'])
    df = pd.merge(cf,df,on=['item_id'],how = 'inner')
    
    df.loc[df['label'] =='False','label']='FALSE'
    df.loc[df['label'] =='True','label']='TRUE'
    df.loc[df['label'] =='Mostly False','label']='MOSTLY FALSE'
    df.loc[df['label'] =='Mostly True','label']='MOSTLY TRUE'
    
    return df

df = load_data('..\\..\\training_data\\facebook.csv.gz', '..\\..\\training_data\\factchecks.csv')
df.head()

  if self.run_code(code, result):


Unnamed: 0,site,item_id,label,content
0,facebook,sonoma.sheriff:::::1744890555535510,False,thank you sheriff
1,facebook,sonoma.sheriff:::::1744890555535510,False,"give em hell , sheriff ! thank you for keep ou..."
2,facebook,sonoma.sheriff:::::1744890555535510,False,sheriff giardano you be do a fantastic job tha...
3,facebook,sonoma.sheriff:::::1744890555535510,False,we appreciate and value you !
4,facebook,sonoma.sheriff:::::1744890555535510,False,"the people of sonoma county support you , sher..."


In [75]:
# what is the distribution over class labels?
print(df.label.value_counts())
print()
# Percentage statistics
print(df.label.value_counts(normalize=True))


FALSE             243741
MOSTLY FALSE      112454
Pants on Fire!     85484
TRUE               54826
MOSTLY TRUE        42246
Half-True          37096
MIXTURE            35604
Name: label, dtype: int64

FALSE             0.398627
MOSTLY FALSE      0.183913
Pants on Fire!    0.139805
TRUE              0.089665
MOSTLY TRUE       0.069091
Half-True         0.060669
MIXTURE           0.058229
Name: label, dtype: float64


In [49]:
def make_features(df):
    vec = DictVectorizer()
    feature_dicts = []
    # just as an initial example, we will consider three
    # word features in the model.
    words_to_track = ['certainly', 'impossible']
    for i, row in df.iterrows():
        features = {}
        token_counts = Counter(re.sub('\W+', ' ', row['content'].lower()).split())
        for w in words_to_track:
            features[w] = token_counts[w]
        feature_dicts.append(features)
    X = vec.fit_transform(feature_dicts)
    return X, vec

df['content'] = df['content'].astype(str)
X, vec = make_features(df)

In [50]:
# what are dimensions of the feature matrix?
X.shape

(611451, 2)

In [51]:
# what are the feature names?
# vocabulary_ is a dict from feature name to column index
vec.vocabulary_

{'certainly': 0, 'impossible': 1}

In [52]:
# how often does each word occur?
for word, idx in vec.vocabulary_.items():
    print('%20s\t%d' % (word, X[:,idx].sum()))

           certainly	1672
          impossible	480


In [53]:
# can also get a simple list of feature names:
vec.get_feature_names()
# e.g., first column is 'hate', second is 'love', etc.

['certainly', 'impossible']

In [54]:
# we'll first store the classes separately in a numpy array
y = np.array(df.label)
Counter(y)

Counter({'FALSE': 243741,
         'Pants on Fire!': 85484,
         'MOSTLY FALSE': 112454,
         'MIXTURE': 35604,
         'Half-True': 37096,
         'MOSTLY TRUE': 42246,
         'TRUE': 54826})

In [55]:
# to find the row indices with FALSE label
np.where(y=='FALSE')[0]

array([     0,      1,      2, ..., 325155, 325156, 325157], dtype=int64)

In [56]:
# store the class names
class_names = set(df.label)

In [57]:
# how often does each word appear in each class?
for word, idx in vec.vocabulary_.items():
    for class_name in class_names:
        class_idx = np.where(y==class_name)[0]
        print('%20s\t%20s\t%d' % (word, class_name, X[class_idx, idx].sum()))

           certainly	               FALSE	346
           certainly	                TRUE	147
           certainly	      Pants on Fire!	352
           certainly	        MOSTLY FALSE	324
           certainly	           Half-True	214
           certainly	         MOSTLY TRUE	212
           certainly	             MIXTURE	77
          impossible	               FALSE	205
          impossible	                TRUE	17
          impossible	      Pants on Fire!	51
          impossible	        MOSTLY FALSE	86
          impossible	           Half-True	31
          impossible	         MOSTLY TRUE	72
          impossible	             MIXTURE	18


In [58]:
# fit a LogisticRegression classifier.
clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
clf.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [59]:
# for binary classification, LogisticRegression stores a single coefficient vector
clf.coef_
# this would be a matrix for a multi-class probem.

array([[-0.76065686,  0.17504027],
       [ 0.58806997,  0.12298106],
       [-0.33146673, -0.29184699],
       [-0.07083167,  0.08017021],
       [ 0.45063573,  0.7990349 ],
       [ 0.2741486 , -0.1759327 ],
       [-0.14989903, -0.70944676]])

In [60]:
# for binary classification, the coefficients for the negative class is just the negative of the positive class.
coef = -clf.coef_
print(coef)

[[ 0.76065686 -0.17504027]
 [-0.58806997 -0.12298106]
 [ 0.33146673  0.29184699]
 [ 0.07083167 -0.08017021]
 [-0.45063573 -0.7990349 ]
 [-0.2741486   0.1759327 ]
 [ 0.14989903  0.70944676]]


In [61]:
for ci, class_name in enumerate(clf.classes_):
    print('coefficients for %s' % class_name)
    display(pd.DataFrame([coef[ci]], columns=vec.get_feature_names()))

coefficients for FALSE


Unnamed: 0,certainly,impossible
0,0.760657,-0.17504


coefficients for Half-True


Unnamed: 0,certainly,impossible
0,-0.58807,-0.122981


coefficients for MIXTURE


Unnamed: 0,certainly,impossible
0,0.331467,0.291847


coefficients for MOSTLY FALSE


Unnamed: 0,certainly,impossible
0,0.070832,-0.08017


coefficients for MOSTLY TRUE


Unnamed: 0,certainly,impossible
0,-0.450636,-0.799035


coefficients for Pants on Fire!


Unnamed: 0,certainly,impossible
0,-0.274149,0.175933


coefficients for TRUE


Unnamed: 0,certainly,impossible
0,0.149899,0.709447


In [62]:
# sort coefficients by class.
features = vec.get_feature_names()
for ci, class_name in enumerate(clf.classes_):
    print('top features for class %s' % class_name)
    for fi in coef[ci].argsort()[::-1]: # descending order.
        print('%20s\t%.2f' % (features[fi], coef[ci][fi]))

top features for class FALSE
           certainly	0.76
          impossible	-0.18
top features for class Half-True
          impossible	-0.12
           certainly	-0.59
top features for class MIXTURE
           certainly	0.33
          impossible	0.29
top features for class MOSTLY FALSE
           certainly	0.07
          impossible	-0.08
top features for class MOSTLY TRUE
           certainly	-0.45
          impossible	-0.80
top features for class Pants on Fire!
          impossible	0.18
           certainly	-0.27
top features for class TRUE
          impossible	0.71
           certainly	0.15


In [63]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []
for train, test in kf.split(X):
    clf.fit(X[train], y[train])
    pred = clf.predict(X[test])
    accuracies.append(accuracy_score(y[test], pred))
    
    
print('accuracy over all cross-validation folds: %s' % str(accuracies))
print('mean=%.2f std=%.2f' % (np.mean(accuracies), np.std(accuracies)))

accuracy over all cross-validation folds: [0.3979197160870383, 0.39767765148417694, 0.4000899501185706, 0.39817646577806853, 0.39911685338130676]
mean=0.40 std=0.00
