See instructions here:

https://github.com/tapilab/elevate-osna-starter/blob/master/lessons/week2/README.md#day-2

In [9]:
from collections import Counter
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def load_data(datafile):
    """
    Load the labelled examples from a CSV file into one pandas dataframe.

    Only the `text` and `hostile` columns of the file are used. The returned
    frame has exactly two columns, one row per instance to be classified:
    - `text`: the text of the instance, as read from the file
    - `label`: the class label, 'hostile' where the file's `hostile` value
      equals 1 and 'nonhostile' for every other value
    """
    raw = pd.read_csv(datafile)
    # translate the numeric flag into a readable class name
    labels = ['hostile' if flag == 1 else 'nonhostile' for flag in raw['hostile']]
    return pd.DataFrame({'text': raw['text'], 'label': labels})

# NOTE(review): this path points into one user's home directory (~/Dropbox), so
# the cell only runs on a machine that has the file at that exact location.
df = load_data('~/Dropbox/elevate/harassment/training_data/data.csv.gz')
# preview the first rows of the loaded frame
df.head()

Unnamed: 0,text,label
0,@FlyGuyCree Nigga whatever one you gave me 🤦🏻‍♀️,nonhostile
1,@ArvindKejriwal . go to hell you ass hole.,hostile
2,@JohnJohnDaDon That “nigga” done lost his fuck...,hostile
3,@kane_tingle10 Can’t be fucked with them mate....,nonhostile
4,@JHarris_TheDon Its honestly better anyways. T...,nonhostile


In [4]:
# what is the distribution over class labels?
df.label.value_counts()

hostile       3588
nonhostile    3186
Name: label, dtype: int64

In [5]:
def make_features(df, words_to_track=None):
    """
    Build a small word-count feature matrix from the `text` column of `df`.

    Each text is lower-cased, every run of non-word characters is replaced by
    a single space, and the result is split on whitespace. For every row, the
    number of occurrences of each tracked word becomes one feature.

    Parameters
    ----------
    df : pandas DataFrame
        Must have a `text` column containing strings.
    words_to_track : str or iterable of str, optional
        The words to count. Defaults to ['you', 'hate', 'love'], the three
        words used as the initial example. Different words give a different
        model. A single string is treated as one word.

    Returns
    -------
    X : the matrix produced by `DictVectorizer.fit_transform`, with one row
        per row of `df`.
    vec : the fitted DictVectorizer.
    """
    if words_to_track is None:
        words_to_track = ['you', 'hate', 'love']
    elif isinstance(words_to_track, str):
        # avoid silently splitting a lone word into its characters
        words_to_track = [words_to_track]
    else:
        words_to_track = list(words_to_track)

    # raw string: '\W' in a plain string literal is an invalid escape sequence
    non_word = re.compile(r'\W+')

    feature_dicts = []
    for text in df['text']:
        token_counts = Counter(non_word.sub(' ', text.lower()).split())
        # Counter returns 0 for words that never occur, so every row gets
        # an entry for every tracked word
        feature_dicts.append({w: token_counts[w] for w in words_to_track})

    vec = DictVectorizer()
    X = vec.fit_transform(feature_dicts)
    return X, vec
                
X, vec = make_features(df)

In [6]:
X
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html

<6774x3 sparse matrix of type '<class 'numpy.float64'>'
	with 1933 stored elements in Compressed Sparse Row format>

In [52]:
# Now, use CountVectorizer to create a term feature matrix.
# The vectorizer is built with its default settings and fit on the text column;
# the displayed shape is (number of texts, number of terms).
# Note that later cells rebind both `count_vec` and `X_words`.
count_vec = CountVectorizer()
X_words = count_vec.fit_transform(df.text)
X_words.shape

(6774, 21018)

In [53]:
# how sparse is it? 
def print_sparsity(X_words):
    """
    Print the number of columns ("words") of a matrix and how many of its
    cells are stored as non-zero.

    Parameters
    ----------
    X_words : object exposing a 2-element `shape` and an `nnz` count,
        e.g. a scipy sparse matrix.

    Prints two lines: the column count, then the non-zero count, the total
    number of cells, and the non-zero percentage. A matrix with no cells
    (zero rows or zero columns) is reported as 0.00% rather than raising
    ZeroDivisionError.
    """
    n_rows, n_cols = X_words.shape
    print('%d words' % n_cols)
    num_cells = n_rows * n_cols
    # guard the empty-matrix case, where the ratio is undefined
    pct_nonzero = 100 * X_words.nnz / num_cells if num_cells else 0.0
    print('%d of %d possible cells are non-zero (%.2f%%)' %
          (X_words.nnz, num_cells, pct_nonzero))
print_sparsity(X_words)

21018 words
99153 of 142375932 possible cells are non-zero (0.07%)


In [54]:
# how does sparsity vary with min_df?
# Each pass builds a fresh CountVectorizer with a larger min_df and reports the
# resulting vocabulary size and fill rate (see the printed output below).
# Note: every iteration rebinds the notebook-level `count_vec` and `X_words`,
# so after the loop they hold the vectorizer and matrix for min_df=10.
for min_df in [1,2,5,10]:
    count_vec = CountVectorizer(min_df=min_df)
    print('\n\nmin_df=%d' % min_df)
    X_words = count_vec.fit_transform(df.text)
    print_sparsity(X_words)



min_df=1
21018 words
99153 of 142375932 possible cells are non-zero (0.07%)


min_df=2
5120 words
83255 of 34682880 possible cells are non-zero (0.24%)


min_df=5
1866 words
74977 of 12640284 possible cells are non-zero (0.59%)


min_df=10
951 words
69038 of 6442074 possible cells are non-zero (1.07%)


In [55]:
# top terms?
def print_top_words(X_words, count_vec, n=10):
    """
    Print the `n` columns of `X_words` with the largest column totals,
    highest first, one per line as "<feature name>\\t<total>".

    Parameters
    ----------
    X_words : 2-D matrix (e.g. scipy sparse) with one column per feature.
    count_vec : fitted vectorizer whose feature names are in the same order
        as the columns of `X_words`.
    n : int, maximum number of features to print (default 10).
    """
    # get_feature_names() was removed from scikit-learn in favour of
    # get_feature_names_out(); use the new method when it exists and fall
    # back to the old one for older installations.
    if hasattr(count_vec, 'get_feature_names_out'):
        features = count_vec.get_feature_names_out()
    else:
        features = count_vec.get_feature_names()
    # np.asarray(...).ravel() flattens the column totals whether sum()
    # returned an np.matrix or a plain ndarray (only np.matrix has .A1)
    word_counts = np.asarray(X_words.sum(axis=0)).ravel()
    for i in np.argsort(word_counts)[::-1][:n]:
        print('%20s\t%d' % (features[i], word_counts[i]))

print_top_words(X_words, count_vec)

                 the	2716
                 you	2622
               nigga	2340
                  to	1918
                 and	1691
                that	1641
                  it	1232
                  is	1221
                shit	1184
                  of	1091


In [56]:
# top terms using different ngrams
# For each ngram_range, fit a CountVectorizer with min_df=2 and report the
# vocabulary size, fill rate and most frequent terms.
# Note: every iteration rebinds the notebook-level `count_vec` and `X_words`,
# so after the loop they hold the ngram_range=(1, 3) vectorizer and matrix;
# any later cell that reads `X_words` gets that one.
for ngram in [(1,1), (2,2), (3,3), (1,3)]:
    count_vec = CountVectorizer(min_df=2, ngram_range=ngram)
    print('\nngram_range=%s' % str(ngram))
    X_words = count_vec.fit_transform(df.text)
    print_sparsity(X_words)
    print_top_words(X_words, count_vec)


ngram_range=(1, 1)
5120 words
83255 of 34682880 possible cells are non-zero (0.24%)
                 the	2716
                 you	2622
               nigga	2340
                  to	1918
                 and	1691
                that	1641
                  it	1232
                  is	1221
                shit	1184
                  of	1091

ngram_range=(2, 2)
9104 words
41935 of 61670496 possible cells are non-zero (0.07%)
            https co	355
              in the	216
              you re	214
          that nigga	195
            my nigga	190
          this nigga	174
               to be	154
            the fuck	151
              of the	131
           that shit	120

ngram_range=(3, 3)
3080 words
8252 of 20863920 possible cells are non-zero (0.04%)
       what the fuck	51
      nigga https co	28
       piece of shit	27
         the fuck up	26
       shut the fuck	23
         the fuck is	21
        fuck off you	19
       is wrong with	17
     bitch ass nigga	15
       the fact that

In [16]:
# we'll first store the classes separately in a numpy array
y = np.array(df.label)
Counter(y)

Counter({'nonhostile': 3186, 'hostile': 3588})

In [15]:
# store the class names
# NOTE(review): a set has no guaranteed iteration order, so loops over
# `class_names` may visit the classes in a different order between runs.
class_names = set(df.label)

In [17]:
# how often does each word appear in each class?
# Looks at (at most) the first 10 entries of the vectorizer's vocabulary; each
# entry pairs a word with `idx`, which is used below as a column index into X.
for word, idx in list(vec.vocabulary_.items())[:10]:
    for class_name in class_names:
        # row positions whose label equals this class
        class_idx = np.where(y==class_name)[0]
        # total count of this word over the rows of this class
        print('%20s\t%20s\t%d' % (word, class_name, X[class_idx, idx].sum()))

                 you	             hostile	1690
                 you	          nonhostile	932
                hate	             hostile	20
                hate	          nonhostile	24
                love	             hostile	44
                love	          nonhostile	85


In [44]:
# to combine two sparse matrices:
# NOTE(review): this import sits in the middle of the notebook rather than in
# the import cell at the top.
from scipy.sparse import csr_matrix, hstack # "horizontal stack"
# two small matrices with the same number of rows (2) but different widths
m1 = csr_matrix([[1,2,3], [4,5,6]])
print('m1')
print(m1.todense())
m2 = csr_matrix([[7,8], [9,10]])
print('m2')
print(m2.todense())
# the stacked result keeps the 2 rows and places m2's columns after m1's
hstack([m1, m2]).todense()

m1
[[1 2 3]
 [4 5 6]]
m2
[[ 7  8]
 [ 9 10]]


matrix([[ 1,  2,  3,  7,  8],
        [ 4,  5,  6,  9, 10]], dtype=int64)

In [45]:
# Append the n-gram count columns to the three hand-picked word features.
# NOTE(review): `X_words` is whatever the most recently run cell left bound to
# that name. The displayed width (3 + 17304 columns) is consistent with the
# min_df=2, ngram_range=(1, 3) matrix from the ngram loop — confirm this is the
# intended matrix.
X_all = hstack([X, X_words])
X_all.shape

(6774, 17307)

In [46]:
# fit a LogisticRegression classifier.
# The model is fit on every row of X_all; no data is held out in this cell.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument — confirm against the installed version.
clf = LogisticRegression(solver='lbfgs', multi_class='auto')
clf.fit(X_all, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [47]:
# for binary classification, LogisticRegression stores a single coefficient vector
# (displayed below as an array with one row)
clf.coef_
# this would be a matrix for a multi-class problem.

array([[ 0.09542003,  0.52383939, -0.1823916 , ...,  0.42041654,
         0.10496036, -0.10404671]])

In [48]:
# for binary classification, the coefficients for the negative class are just the negatives of those for the positive class.
# `coef` holds [negated vector, original vector], i.e. one entry per class.
# NOTE(review): presumably index 0 lines up with clf.classes_[0] and index 1
# with clf.classes_[1] — confirm against the scikit-learn documentation.
coef = [-clf.coef_[0], clf.coef_[0]]
print(coef)

[array([-0.09542003, -0.52383939,  0.1823916 , ..., -0.42041654,
       -0.10496036,  0.10404671]), array([ 0.09542003,  0.52383939, -0.1823916 , ...,  0.42041654,
        0.10496036, -0.10404671])]


In [49]:
clf.classes_

array(['hostile', 'nonhostile'], dtype=object)

In [51]:
# NOTE(review): these imports sit in this cell rather than in the import cell
# at the top of the notebook.
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# 5-fold cross-validation with shuffling; random_state fixes the fold assignment.
# NOTE(review): the folds index `X` (the three word-count features returned by
# make_features), not `X_all`, which is what `clf` was fit on earlier — confirm
# that evaluating only the three-feature model is intended.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []
for train, test in kf.split(X):
    # refits the shared `clf` object on this fold's training rows, replacing
    # whatever it was fitted on before
    clf.fit(X[train], y[train])
    pred = clf.predict(X[test])
    # fraction of this fold's held-out rows that were predicted correctly
    accuracies.append(accuracy_score(y[test], pred))
    
    
print('accuracy over all cross-validation folds: %s' % str(accuracies))
print('mean=%.2f std=%.2f' % (np.mean(accuracies), np.std(accuracies)))

accuracy over all cross-validation folds: [0.5424354243542435, 0.5476014760147602, 0.5276752767527675, 0.5402214022140222, 0.5236336779911374]
mean=0.54 std=0.01
