See instructions here:

https://github.com/tapilab/elevate-osna-starter/blob/master/lessons/week3/README.md#day-1

In [1]:
from collections import Counter
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def load_data(datafile):
    """
    Read your data into a single pandas dataframe where
    - each row is an instance to be classified
    (this could be a tweet, user, or news article, depending on your project)
    - there is a column called `label` which stores the class label (e.g., the true
      category for this row)
    """
    df = pd.read_csv(datafile)[['text', 'hostile']]
    df.columns = ['text', 'label']
    df['label'] = ['hostile' if i==1 else 'nonhostile' for i in df.label]
    return df

df = load_data('~/Dropbox/elevate/harassment/training_data/data.csv.gz')
df.head()

Unnamed: 0,text,label
0,@FlyGuyCree Nigga whatever one you gave me 🤦🏻‍♀️,nonhostile
1,@ArvindKejriwal . go to hell you ass hole.,hostile
2,@JohnJohnDaDon That “nigga” done lost his fuck...,hostile
3,@kane_tingle10 Can’t be fucked with them mate....,nonhostile
4,@JHarris_TheDon Its honestly better anyways. T...,nonhostile


In [3]:
# what is the distribution over class labels?
df.label.value_counts()

hostile       3588
nonhostile    3186
Name: label, dtype: int64

In [4]:
def make_features(df):
    vec = DictVectorizer()
    feature_dicts = []
    # just as an initial example, we will consider three
    # word features in the model.
    words_to_track = ['you', 'hate', 'love']
    # will get different model for different features.
    #words_to_track = ['you']
    for i, row in df.iterrows():
        features = {}
        token_counts = Counter(re.sub('\W+', ' ', row['text'].lower()).split())
        for w in words_to_track:
            features[w] = token_counts[w]
        feature_dicts.append(features)
    X = vec.fit_transform(feature_dicts)
    return X, vec
                
X, vec = make_features(df)

In [5]:
X
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html

<6774x3 sparse matrix of type '<class 'numpy.float64'>'
	with 1933 stored elements in Compressed Sparse Row format>

In [38]:
# Now, use CountVectorizer to create a term feature matrix.
count_vec = CountVectorizer(min_df=1)
X_words = count_vec.fit_transform(df.text)
X_words.shape

(6774, 21018)

In [39]:
# top terms?
def print_top_words(X_words, count_vec, n=10):
    features = count_vec.get_feature_names()
    word_counts = X_words.sum(axis=0).A1
    for i in np.argsort(word_counts)[::-1][:n]:
        print('%20s\t%d' % (features[i], word_counts[i]))

print_top_words(X_words, count_vec)

                 the	2716
                 you	2622
               nigga	2340
                  to	1918
                 and	1691
                that	1641
                  it	1232
                  is	1221
                shit	1184
                  of	1091


In [30]:
# we'll first store the classes separately in a numpy array
y = np.array(df.label)
Counter(y)

Counter({'nonhostile': 3186, 'hostile': 3588})

In [40]:
# store the class names
class_names = set(df.label)

In [42]:
X_all = hstack([X, X_words]).tocsr()
X_all.shape

(6774, 21021)

In [33]:
# fit a LogisticRegression classifier.
clf = LogisticRegression(solver='lbfgs', multi_class='auto')
clf.fit(X_all, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [34]:
# for binary classification, the coefficients for the negative class is just the negative of the positive class.
coef = [-clf.coef_[0], clf.coef_[0]]
print(coef)

[array([-0.13452078, -0.68387522,  0.14541614, ...,  0.06153179,
        0.06153179,  0.06153179]), array([ 0.13452078,  0.68387522, -0.14541614, ..., -0.06153179,
       -0.06153179, -0.06153179])]


In [35]:
clf.classes_

array(['hostile', 'nonhostile'], dtype=object)

In [36]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []
for train, test in kf.split(X):
    clf.fit(X[train], y[train])
    pred = clf.predict(X[test])
    accuracies.append(accuracy_score(y[test], pred))
    
    
print('accuracy over all cross-validation folds: %s' % str(accuracies))
print('mean=%.2f std=%.2f' % (np.mean(accuracies), np.std(accuracies)))

accuracy over all cross-validation folds: [0.5424354243542435, 0.5476014760147602, 0.5276752767527675, 0.5402214022140222, 0.5236336779911374]
mean=0.54 std=0.01


In [18]:
features = np.array(vec.get_feature_names() + count_vec.get_feature_names())
len(features)

21021

In [20]:
preds = clf.predict(X)
preds[0]

'hostile'

In [57]:
# why was the first example labeled hostile?
for i in np.argsort(coef[0][X_all[0].nonzero()[1]])[::-1]:
    idx = X_all[0].nonzero()[1][i]
    print(features[idx])
    print(coef[0][idx])


nigga
0.19077783360160389
you
0.14541613571878462
you
0.14541613571878462
gave
-0.020392695987220318
flyguycree
-0.16244314389468142
me
-0.21987134662842517
whatever
-0.29767900360130367
one
-0.4465985350915529
