In this lab, we will
- read our project data into a Pandas DataFrame
- write a function to compute simple features for each row of the data frame
- fit a LogisticRegression model to the data
- print the top coefficients
- compute measures of accuracy

I've given you starter code below. You should:
- First, try to get it to work with your data. It may require changing the load_data file to match the requirements of your data (e.g., what is the object you are classifying -- a tweet, a user, a news article?)
- Second, you should add additional features to the make_features function:
  - Be creative. It could be additional word features, or other meta data about the user, date, etc.
- As you try out different feature combinations, print out the coefficients and accuracy scores
- List any features that seem to improve accuracy. Why do you think that is?

In [131]:
from collections import Counter
import numpy as np
import pandas as pd
import re
import glob
import gzip
import json
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix, hstack # "horizontal stack"

In [132]:
def load_data(datafile):
    """
    Read your data into a single pandas dataframe where
    - each row is an instance to be classified
    (this could be a tweet, user, or news article, depending on your project)
    - there is a column called `label` which stores the class label (e.g., the true
      category for this row)
    """
    bots = []
    humans = []
    folder = ['/bots', '/humans']
    name = '/*.json.gz'
    for f in folder:
        paths = glob.glob(datafile + f + name)
        for p in paths:
            with gzip.open(p, 'r') as file:
                for line in file:
                    if f == folder[0]:
                        bots.append(json.loads(line))
                    elif f == folder[1]:
                        humans.append(json.loads(line))
    df_bots = pd.DataFrame(bots)[['screen_name','tweets','listed_count']]
    df_bots['label'] = 'bot'
    df_humans = pd.DataFrame(humans)[['screen_name','tweets','listed_count']]
    df_humans['label'] = 'human'
    frames = [df_bots, df_humans]
    df = pd.concat(frames)
    users = bots + humans
    tweets = [u['tweets'] for u in users]
    text = [d['full_text'] for t in tweets for d in t] 

#     tweets_avg_len = []
    tweets_avg_mentions = []
    tweets_avg_urls = []
    factor = 100
    tweets_texts = []
    for u in users:
        tweets = u['tweets'] # a list of dicts
        texts = [t['full_text'] for t in tweets]
        tweets_texts.append(str(texts).strip('[]'))
#         avg_len = sum(map(len, texts))/len(texts)
#         tweets_avg_len.append(int(avg_len))
        count_mention = 0
        count_url = 0
        for s in texts:
            if 'http' in s:
                count_url+=1
            if '@' in s:
                count_mention+=1
        tweets_avg_urls.append(100 * count_url / len(texts))
        tweets_avg_mentions.append(100 * count_mention / len(texts))
#     df['tweets_avg_len'] = tweets_avg_len
    df['tweets_texts'] = tweets_texts
    df['tweets_avg_urls'] = tweets_avg_urls
    df['tweets_avg_mentions'] = tweets_avg_mentions
    return df
# df = load_data('~/Dropbox/elevate/harassment/training_data/data.csv.gz')
df = load_data('/Users/sheepman/Downloads/bots/small')
df


Unnamed: 0,screen_name,tweets,listed_count,label,tweets_texts,tweets_avg_urls,tweets_avg_mentions
0,carlos_eggbot,[{'created_at': 'Sat Jun 01 18:36:07 +0000 201...,0,bot,"'You heard me! Shoot me.', 'Junpei, you...', '...",10.500000,0.000000
1,ecolo_ebooks,[{'created_at': 'Sat Jun 01 18:36:11 +0000 201...,2,bot,"""i'm not straight but 20 bucks is 20 bu"", '""ec...",0.000000,0.000000
2,AllStarSMBot,[{'created_at': 'Sat Jun 01 18:36:28 +0000 201...,3,bot,"""You'll never know if you don't go\nYou'll nev...",0.000000,0.000000
3,saionji_en,[{'created_at': 'Sat Jun 01 18:36:52 +0000 201...,3,bot,"""why the fuck am i banana girl? i'll never die...",0.000000,0.000000
4,KimClune,[{'created_at': 'Sat Jun 01 18:37:20 +0000 201...,329,bot,'Chewing rather than drinking breakfast is AWE...,28.500000,2.500000
5,CatsDogsBOT,[{'created_at': 'Sat Jun 01 18:38:10 +0000 201...,3,bot,"'[Discussion] If I say no, that should be it. ...",100.000000,0.000000
6,bluejovanka,[{'created_at': 'Sat Jun 01 18:38:14 +0000 201...,47,bot,"""I'm staying in tonight watching someone with ...",37.688442,32.160804
7,anittavota4,[{'created_at': 'Sat Jun 01 18:39:19 +0000 201...,0,bot,'RT BrettHillOwens2: #PremiosMTVMIAW #MTVBRMUS...,0.000000,0.000000
8,justtraveluk,[{'created_at': 'Sat Jun 01 18:39:21 +0000 201...,11,bot,'The Top 5 Airports in the World for Departure...,100.000000,0.000000
9,rhaudiencebot,[{'created_at': 'Sat Jun 01 18:40:08 +0000 201...,0,bot,"'GET BUTCH, BITCH!', 'HEY RIFF, WHAT DO YOU DO...",0.000000,0.000000


In [133]:
# what is the distribution over class labels?
df.label.value_counts()
df.dtypes

screen_name             object
tweets                  object
listed_count             int64
label                   object
tweets_texts            object
tweets_avg_urls        float64
tweets_avg_mentions    float64
dtype: object

In [152]:
def make_features(df):
    vec = DictVectorizer()
    feature_dicts = []
    labels_to_track = ['tweets_avg_urls', 'tweets_avg_mentions','listed_count']
    for i, row in df.iterrows():
        features = {}
        features['tweets_avg_urls'] = row['tweets_avg_urls']
        features['tweets_avg_mentions'] = row['tweets_avg_mentions']
        features['listed_count'] = row['listed_count']
        feature_dicts.append(features)
    X = vec.fit_transform(feature_dicts)
#     print(X)
    return X, vec

X, vec = make_features(df)


In [153]:
# what are dimensions of the feature matrix?
X.shape


(200, 3)

In [154]:
# what are the feature names?
# vocabulary_ is a dict from feature name to column index
vec.vocabulary_

{'tweets_avg_urls': 2, 'tweets_avg_mentions': 1, 'listed_count': 0}

In [155]:
# use CountVectorizer to create a term feature matrix
count_vec = CountVectorizer()
X_words = count_vec.fit_transform(df.tweets_texts)
X_words.shape

(200, 68224)

In [156]:
# how sparse is it? 
def print_sparsity(X_words):
    print('%d words' % X_words.shape[1])
    num_cells = X_words.shape[0] * X_words.shape[1]
    print('%d of %d possible cells are non-zero (%.2f%%)' %
          (X_words.nnz, num_cells,
           100 * X_words.nnz/num_cells))
print_sparsity(X_words)

68224 words
180988 of 13644800 possible cells are non-zero (1.33%)


In [157]:
# how does sparsity vary with min_df?
# 
X_words_list = []
for min_df in [1,2,5,10]:
    count_vec = CountVectorizer(min_df=min_df, max_df=1.0, ngram_range=(1,1))
    print('\n\nmin_df=%d' % min_df)
    X_words = count_vec.fit_transform(df.tweets_texts)
    X_words_list.append(X_words)
    print_sparsity(X_words)

for max_df in  [1.0, 0.95, 0.8]:
    count_vec = CountVectorizer(min_df=2, max_df=max_df, ngram_range=(1,1))
    print('\n\nmax_df=%f' % max_df)
    X_words = count_vec.fit_transform(df.tweets_texts)
    X_words_list.append(X_words)
    print_sparsity(X_words)

# terms using different ngrams
for ngram in [(1,1), (1,2), (1,3)]:
    count_vec = CountVectorizer(min_df=2, max_df=1.0, ngram_range=ngram)
    print('\n\nngram=', str(ngram))
    X_words = count_vec.fit_transform(df.tweets_texts)
    X_words_list.append(X_words)
    print_sparsity(X_words)





min_df=1
68224 words
180988 of 13644800 possible cells are non-zero (1.33%)


min_df=2
14528 words
127292 of 2905600 possible cells are non-zero (4.38%)


min_df=5
5708 words
104659 of 1141600 possible cells are non-zero (9.17%)


min_df=10
2988 words
86980 of 597600 possible cells are non-zero (14.55%)


max_df=1.000000
14528 words
127292 of 2905600 possible cells are non-zero (4.38%)


max_df=0.950000
14528 words
127292 of 2905600 possible cells are non-zero (4.38%)


max_df=0.800000
14514 words
124925 of 2902800 possible cells are non-zero (4.30%)


ngram= (1, 1)
14528 words
127292 of 2905600 possible cells are non-zero (4.38%)


ngram= (1, 2)
47248 words
261312 of 9449600 possible cells are non-zero (2.77%)


ngram= (1, 3)
65630 words
312700 of 13126000 possible cells are non-zero (2.38%)


In [158]:
# how often does each word occur?
for word, idx in vec.vocabulary_.items():
    print('%20s\t%d' % (word, X[:,idx].sum()))

     tweets_avg_urls	9149
 tweets_avg_mentions	7028
        listed_count	147695


In [159]:
# can also get a simple list of feature names:
vec.get_feature_names()

# e.g., first column is 'hate', second is 'love', etc.

['listed_count', 'tweets_avg_mentions', 'tweets_avg_urls']

In [160]:
# we'll first store the classes separately in a numpy array
y = np.array(df.label)
Counter(y)

Counter({'bot': 100, 'human': 100})

In [161]:
# store the class names
class_names = set(df.label)

In [162]:
# how often does each word appear in each class?
for word, idx in vec.vocabulary_.items():
    for class_name in class_names:
        class_idx = np.where(y==class_name)[0]
        print('%20s\t%20s\t%d' % (word, class_name, X[class_idx, idx].sum()))

     tweets_avg_urls	               human	5491
     tweets_avg_urls	                 bot	3657
 tweets_avg_mentions	               human	6123
 tweets_avg_mentions	                 bot	905
        listed_count	               human	145344
        listed_count	                 bot	2351


So, `you` appears more frequently in positive (hostile) class, and `love` appears more frequently in the negative (non-hostile) class.

In [163]:
terms = [('min_dif', 1),('min_dif', 2),('min_dif', 5),('min_dif', 10),
        ('max_dif', 1.00),('max_dif', 0.95),('max_dif', 0.80),
        ('ngram_range', '(1,1)'), ('ngram_range', '(1,2)'),('ngram_range', '(1,3)')]
# X_all_list = [X_words for X_words in all_X_words]
for i in range(len(X_words_list)):
    print(terms[i][0], '=', terms[i][1])
    print(np.shape(X_words_list[i]))


min_dif = 1
(200, 68224)
min_dif = 2
(200, 14528)
min_dif = 5
(200, 5708)
min_dif = 10
(200, 2988)
max_dif = 1.0
(200, 14528)
max_dif = 0.95
(200, 14528)
max_dif = 0.8
(200, 14514)
ngram_range = (1,1)
(200, 14528)
ngram_range = (1,2)
(200, 47248)
ngram_range = (1,3)
(200, 65630)


In [242]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
clf = LogisticRegression(C=1, penalty='l2')
mean = []
std = []
arr_accuracies = []
for idx in range(len(X_words_list)):
    clf.fit(X_words_list[idx], y)
#     print(clf.coef_)
    coef = [-clf.coef_[0], clf.coef_[0]]
#     print(coef)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []
    X = X_words_list[idx]
    for train, test in kf.split(X):
        clf.fit(X[train], y[train])
        pred = clf.predict(X[test])
        accuracies.append(accuracy_score(y[test], pred))
    arr_accuracies.append(str(accuracies))
    mean.append(np.mean(accuracies))
    std.append(np.std(accuracies))

In [231]:
print('min_dif', '%10s'%'accuracy')
for i in range(0,4):
    print('%5s%10.2f'%(terms[i][1], mean[i]))
print('max_dif', '%10s'%'accuracy')
for i in range(4,7):
    print('%5s%10.2f'%(terms[i][1], mean[i])) 
print('ngram', '%10s'%'accuracy')
for i in range(7,10):
    print('%5s%10.2f'%(terms[i][1], mean[i])) 

min_dif   accuracy
    1      0.77
    2      0.76
    5      0.78
   10      0.77
max_dif   accuracy
  1.0      0.76
 0.95      0.76
  0.8      0.78
ngram   accuracy
(1,1)      0.76
(1,2)      0.78
(1,3)      0.76


In [243]:
C=[0.1, 1, 5, 10]
p=['l1','l2']
clf1 = LogisticRegression(C=C[0], penalty='l2')
clf2 = LogisticRegression(C=C[1], penalty='l2')
clf3 = LogisticRegression(C=C[2], penalty='l2')
clf4 = LogisticRegression(C=C[3], penalty='l2')
clf5 = LogisticRegression(C=1, penalty=p[0])
clf6 = LogisticRegression(C=1, penalty=p[1])

clf_list = [clf1, clf2, clf3, clf4, clf5, clf6]
X = X_words_list[1]
mean = []
std = []
arr_accuracies = []

for clf in clf_list:
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []
    for train, test in kf.split(X):
        clf.fit(X[train], y[train])
        pred = clf.predict(X[test])
        accuracies.append(accuracy_score(y[test], pred))
    arr_accuracies.append(str(accuracies))
    mean.append(np.mean(accuracies))
    std.append(np.std(accuracies))

print('%5s'%'C', '%12s'%'accuracy')
for i in range(0,4):
    print('%5s%10.2f'%(C[i], mean[i]))
print('penalty', '%10s'%'accuracy')
for i in range(0,2):
    print('%5s%10.2f'%(p[i], mean[i])) 


    C     accuracy
  0.1      0.75
    1      0.76
    5      0.75
   10      0.75
penalty   accuracy
   l1      0.75
   l2      0.76
