In this lab, we will
- read our project data into a Pandas DataFrame
- write a function to compute simple features for each row of the data frame
- fit a LogisticRegression model to the data
- print the top coefficients
- compute measures of accuracy

I've given you starter code below. You should:
- First, try to get it to work with your data. It may require changing the load_data file to match the requirements of your data (e.g., what is the object you are classifying -- a tweet, a user, a news article?)
- Second, you should add additional features to the make_features function:
  - Be creative. It could be additional word features, or other meta data about the user, date, etc.
- As you try out different feature combinations, print out the coefficients and accuracy scores
- List any features that seem to improve accuracy. Why do you think that is?

In [1]:
from collections import Counter
import numpy as np
import pandas as pd
import re
import glob
import gzip
import json
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [162]:
def load_data(datafile):
    """
    Read your data into a single pandas dataframe where
    - each row is an instance to be classified
    (this could be a tweet, user, or news article, depending on your project)
    - there is a column called `label` which stores the class label (e.g., the true
      category for this row)
    """
    bots = []
    humans = []
    folder = ['/bots', '/humans']
    name = '/*.json.gz'
    for f in folder:
        paths = glob.glob(datafile + f + name)
        for p in paths:
            with gzip.open(p, 'r') as file:
                for line in file:
                    if f == folder[0]:
                        bots.append(json.loads(line))
                    elif f == folder[1]:
                        humans.append(json.loads(line))
    bots_tweets = [tweet for b in bots if 'tweets' in b for tweet in b['tweets']]
    df_bots = pd.DataFrame(bots_tweets)[['full_text']]
    df_bots['label'] = 'bot'
    bots_text_length = [len(t) for t in df_bots['full_text']]
    df_bots['text_length'] = bots_text_length

    humans_tweets = [tweet for h in humans if 'tweets' in h for tweet in h['tweets']]
    df_humans = pd.DataFrame(humans_tweets)[['full_text']]
    df_humans['label'] = 'human'
    humans_text_length = [len(t) for t in df_humans['full_text']]
    df_humans['text_length'] = humans_text_length

    frames = [df_bots, df_humans]
    df = pd.concat(frames)
    
    df.full_text = df.full_text.astype(str)
    df['text_length>100'] = df['text_length']>100
    df['contains_URL'] = df['full_text'].str.contains('http')
    df['contains_@'] = df['full_text'].str.contains('@')
    return df
# df = load_data('~/Dropbox/elevate/harassment/training_data/data.csv.gz')
df = load_data('/Users/sheepman/Downloads/bots/small')
df


Unnamed: 0,full_text,label,text_length,text_length>100,contains_URL,contains_@
0,You heard me! Shoot me.,bot,23,False,False,False
1,"Junpei, you...",bot,14,False,False,False
2,We did exactly what zero wanted?,bot,32,False,False,False
3,"Let's protect Akane, no matter what.",bot,36,False,False,False
4,Okay! Let's do this then.,bot,25,False,False,False
5,"Hurry Junpei! What was your ""idea""",bot,34,False,False,False
6,"Yeah, what she said!",bot,20,False,False,False
7,Maria! Maria is that you? Just stay there! I'm...,bot,65,False,False,False
8,"Carlos had nobody to go to prom with, so he br...",bot,59,False,False,False
9,"No, I really don't think it wants to win at ro...",bot,64,False,False,False


In [163]:
# what is the distribution over class labels?
df.label.value_counts()
df.dtypes

full_text          object
label              object
text_length         int64
text_length>100      bool
contains_URL         bool
contains_@           bool
dtype: object

In [142]:
def make_features(df):
    vec = DictVectorizer()
    feature_dicts = []
    labels_to_track = ['long_text', 'contains_URL']
    for i, row in df.iterrows():
        features = {}
        if (row['text_length>100']):
            features['long_text'] = 1
        else:
            features['long_text'] = 0
        if (row['contains_URL']):
            features['contains_URL'] = 1
        else:
            features['contains_URL'] = 0
        if (row['contains_@']):
            features['contains_@'] = 1
        else:
            features['contains_@'] = 0
        feature_dicts.append(features)
    print(feature_dicts)
    X = vec.fit_transform(feature_dicts)
#     print(X)
    return X, vec

X, vec = make_features(df)
# print(sum((df['label']=='bot') & (df['text_length>100'])))
# print(sum((df['label']=='human') & (df['text_length>100'])))
# print(sum((df['label']=='bot') & (df['contains_URL'])))
# print(sum((df['label']=='human') & (df['contains_URL'])))
# print(sum((df['label']=='bot') & (df['contains_URL'])  & (df['text_length>100'])))
# print(sum((df['label']=='human') & (df['contains_URL'])  & (df['text_length>100'])))


# X, vec = make_features(df)

[{'long_text': 0, 'contains_URL': 0, 'contains_@': 0}, {'long_text': 0, 'contains_URL': 0, 'contains_@': 0}, {'long_text': 0, 'contains_URL': 0, 'contains_@': 0}, {'long_text': 0, 'contains_URL': 0, 'contains_@': 0}, {'long_text': 0, 'contains_URL': 0, 'contains_@': 0}, {'long_text': 0, 'contains_URL': 0, 'contains_@': 0}, {'long_text': 0, 'contains_URL': 0, 'contains_@': 0}, {'long_text': 0, 'contains_URL': 0, 'contains_@': 0}, {'long_text': 0, 'contains_URL': 0, 'contains_@': 0}, {'long_text': 0, 'contains_URL': 0, 'contains_@': 0}, {'long_text': 0, 'contains_URL': 0, 'contains_@': 0}, {'long_text': 0, 'contains_URL': 0, 'contains_@': 0}, {'long_text': 0, 'contains_URL': 0, 'contains_@': 0}, {'long_text': 0, 'contains_URL': 0, 'contains_@': 0}, {'long_text': 0, 'contains_URL': 0, 'contains_@': 0}, {'long_text': 0, 'contains_URL': 1, 'contains_@': 0}, {'long_text': 0, 'contains_URL': 1, 'contains_@': 0}, {'long_text': 0, 'contains_URL': 0, 'contains_@': 0}, {'long_text': 0, 'contains_

In [143]:
# what are dimensions of the feature matrix?
X.shape


(39989, 3)

In [144]:
# what are the feature names?
# vocabulary_ is a dict from feature name to column index
vec.vocabulary_

{'long_text': 2, 'contains_URL': 1, 'contains_@': 0}

In [145]:
# how often does each word occur?
for word, idx in vec.vocabulary_.items():
    print('%20s\t%d' % (word, X[:,idx].sum()))

           long_text	20456
        contains_URL	18295
          contains_@	14052


In [146]:
# can also get a simple list of feature names:
vec.get_feature_names()
# e.g., first column is 'hate', second is 'love', etc.

['contains_@', 'contains_URL', 'long_text']

In [164]:
# we'll first store the classes separately in a numpy array
y = np.array(df.label)
Counter(y)

Counter({'bot': 19991, 'human': 19998})

In [166]:
# to find the row indices with hostile label
np.where(y=='bot')[0]
# np.where(y=='human')[0]

array([    0,     1,     2, ..., 19988, 19989, 19990])

In [167]:
# store the class names
class_names = set(df.label)

In [168]:
# how often does each word appear in each class?
for word, idx in vec.vocabulary_.items():
    for class_name in class_names:
        class_idx = np.where(y==class_name)[0]
        print('%20s\t%20s\t%d' % (word, class_name, X[class_idx, idx].sum()))

           long_text	               human	12515
           long_text	                 bot	7941
        contains_URL	               human	10983
        contains_URL	                 bot	7312
          contains_@	               human	12246
          contains_@	                 bot	1806


So, `you` appears more frequently in positive (hostile) class, and `love` appears more frequently in the negative (non-hostile) class.

In [169]:
# fit a LogisticRegression classifier.
clf = LogisticRegression(solver='lbfgs', multi_class='auto')
clf.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [170]:
# for binary classification, LogisticRegression stores a single coefficient vector
clf.coef_
# this would be a matrix for a multi-class probem.

array([[3.09091937, 1.3586826 , 0.42282148]])

In [171]:
# for binary classification, the coefficients for the negative class is just the negative of the positive class.
coef = [-clf.coef_[0], clf.coef_[0]]
print(coef)

[array([-3.09091937, -1.3586826 , -0.42282148]), array([3.09091937, 1.3586826 , 0.42282148])]


In [172]:
for ci, class_name in enumerate(clf.classes_):
    print('coefficients for %s' % class_name)
    display(pd.DataFrame([coef[ci]], columns=vec.get_feature_names()))

coefficients for bot


Unnamed: 0,contains_@,contains_URL,long_text
0,-3.090919,-1.358683,-0.422821


coefficients for human


Unnamed: 0,contains_@,contains_URL,long_text
0,3.090919,1.358683,0.422821


In [178]:
# sort coefficients by class.
features = vec.get_feature_names()
for ci, class_name in enumerate(clf.classes_):
    print('top features for class %5s' % class_name)
    for fi in coef[ci].argsort()[::-1]: # descending order.
        print('%20s\t%.2f' % (features[fi], coef[ci][fi]))

top features for class   bot
           long_text	-0.42
        contains_URL	-1.36
          contains_@	-3.09
top features for class human
          contains_@	3.09
        contains_URL	1.36
           long_text	0.42


In [182]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []
for train, test in kf.split(X):
    clf.fit(X[train], y[train])
    pred = clf.predict(X[test])
    accuracies.append(accuracy_score(y[test], pred))
    
    
print('accuracy over all cross-validation folds: %s' % str(accuracies))
print('mean=%.2f std=%.2f' % (np.mean(accuracies), np.std(accuracies)))

accuracy over all cross-validation folds: [0.7594398599649912, 0.7623155788947237, 0.7570642660665167, 0.7526881720430108, 0.7734150306364886]
mean=0.76 std=0.01
