See instructions here:

https://github.com/tapilab/elevate-osna-starter/blob/master/lessons/week2/README.md#day-2

In [2]:
from collections import Counter
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
def load_data(datafile, checkfile):
    """
    Read your data into a single pandas dataframe where
    - each row is an instance to be classified
    
    (this could be a tweet, user, or news article, depending on your project)
    - there is a column called `label` which stores the class label (e.g., the true
      category for this row)
    """
    df = pd.read_csv(datafile)[['social_id','comment_tokens','comment_time']]
    ck = pd.read_csv(checkfile)
    
    ck = ck.loc[ck['site'] == 'twitter', ['site', 'social_id', 'ruling_val']]
    
    
    ck['social_id'] = ck['social_id'].astype(df['social_id'].dtype)
    
    df.columns = ['id', 'text','time']
#     df = df.drop_duplicates(['id','text'])
    ck.columns = ['site','id','label']
    df = pd.merge(ck,df,on=['id'],how = 'inner')
    df['label'] = ['true' if i>0 else 'false' if i<0 else 'unknown' for i in df.label]
    df['comments_count'] = 1
    df['timemin'] = df['time']
    
    # combine multiple rows of an id into one row
    def ab(df):
        return' '.join(df.values)
    df = df.groupby(['id','label']).agg({'text':ab,'comments_count':sum,'time':max,'timemin':min})
    df['timeslot'] = df['time']-df['timemin']
    df['timepercomm'] = df['timeslot']/df['comments_count']
    df['timepercomm'] = [int(i) for i in df.timepercomm]
    df = df.drop(['time','timemin'],axis=1)
    df = df.reset_index()
    
    return df

df = load_data('..\\..\\training_data\\twitter.csv', '..\\..\\training_data\\factchecks.csv')
df.head()

Unnamed: 0,id,label,text,comments_count,timeslot,timepercomm
0,1972425520,true,rt @senjohnmccain : obama ha more czar than th...,7,78173527,11167646
1,2554608773,false,rt @thatkevinsmith : ten year in and we bone l...,87,1384707,15916
2,10288400197,true,rt @jimdemint : house appropriation chair davi...,13,63956,4919
3,15561382074,false,rt @markos : i get one for politifactref - - i...,2,73,36
4,16415781807,unknown,rt @presssec : who would the gop put in charge...,84,3116561,37101


In [39]:
# what is the distribution over class labels?
df.label.value_counts()

false      793
true       224
unknown    197
Name: label, dtype: int64

In [40]:
def make_features(df):
    vec = DictVectorizer()
    feature_dicts = []
    # just as an initial example, we will consider three
    # word features in the model.
    words_to_track = ['you', 'hate', 'love']
    # will get different model for different features.
    for i, row in df.iterrows():
        features = {}
        token_counts = Counter(re.sub('\W+', ' ', row['text'].lower()).split())
        for w in words_to_track:
            features[w] = token_counts[w]
        feature_dicts.append(features)
    X = vec.fit_transform(feature_dicts)
    return X, vec
                
X, vec = make_features(df)

In [112]:
# X
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html

<1214x3 sparse matrix of type '<class 'numpy.float64'>'
	with 101 stored elements in Compressed Sparse Row format>

In [34]:
# Now, use CountVectorizer to create a term feature matrix.
# count_vec = CountVectorizer()
# X_words = count_vec.fit_transform(df.text)
# X_words.shape

(1214, 5519)

In [41]:
# how sparse is it? 
def print_sparsity(X_words):
    print('%d words' % X_words.shape[1])
    num_cells = X_words.shape[0] * X_words.shape[1]
    print('%d of %d possible cells are non-zero (%.2f%%)' %
          (X_words.nnz, num_cells,
           100 * X_words.nnz/num_cells))
# print_sparsity(X_words)

In [42]:
# how does sparsity vary with min_df?
for min_df in [1,2,5,10]:
    count_vec = CountVectorizer(min_df=min_df)
    print('\n\nmin_df=%d' % min_df)
    X_words = count_vec.fit_transform(df.text)
    print_sparsity(X_words)



min_df=1
5519 words
21101 of 6700066 possible cells are non-zero (0.31%)


min_df=2
1953 words
17535 of 2370942 possible cells are non-zero (0.74%)


min_df=5
653 words
14198 of 792742 possible cells are non-zero (1.79%)


min_df=10
289 words
11850 of 350846 possible cells are non-zero (3.38%)


In [43]:
# top terms?
def print_top_words(X_words, count_vec, n=10):
    features = count_vec.get_feature_names()
    word_counts = X_words.sum(axis=0).A1
    for i in np.argsort(word_counts)[::-1][:n]:
        print('%20s\t%d' % (features[i], word_counts[i]))

# print_top_words(X_words, count_vec)

In [54]:
# top terms using different ngrams
def fit_and_predict(min_df=2.0, max_df=1.0, ngram_range=(1,1)):
    count_vec = CountVectorizer(min_df=2, max_df=1, ngram_range=(1,1))
    print('\=%s' % str(ngram))
    X_words = count_vec.fit_transform(df.text)
    print_sparsity(X_words)
    print_top_words(X_words, count_vec)
    
    y = np.array(df.label)

    class_names = set(df.label)
    
    from scipy.sparse import csr_matrix, hstack
#     X_all = hstack([X, X_words])
    X_all = X_words
    clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    clf.fit(X_all, y)
    
    coef = clf.coef_
    
    from sklearn.model_selection import KFold
    from sklearn.metrics import accuracy_score

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []
    for train, test in kf.split(X_all):
        clf.fit(X_all[train], y[train])
        pred = clf.predict(X_all[test])
        accuracies.append(accuracy_score(y[test], pred))


    print('accuracy over all cross-validation folds: %s' % str(accuracies))
    print('mean=%.2f std=%.2f' % (np.mean(accuracies), np.std(accuracies)))

for ngram in [(1,1), (2,2), (3,3), (1,3)]:
    fit_and_predict(ngram_range=ngram)

\=(1, 1)


ValueError: max_df corresponds to < documents than min_df

In [16]:
# we'll first store the classes separately in a numpy array
y = np.array(df.label)
Counter(y)

Counter({'false': 793, 'true': 224, 'unknown': 197})

In [17]:
# store the class names
class_names = set(df.label)

In [1]:
# how often does each word appear in each class?
# for word, idx in list(vec.vocabulary_.items())[:10]:
#     for class_name in class_names:
#         class_idx = np.where(y==class_name)[0]
#         print('%20s\t%20s\t%d' % (word, class_name, X[class_idx, idx].sum()))

In [18]:
# to combine two sparse matrices:
from scipy.sparse import csr_matrix, hstack # "horizontal stack"
# m1 = csr_matrix([[1,2,3], [4,5,6]])
# print('m1')
# print(m1.todense())
# m2 = csr_matrix([[7,8], [9,10]])
# print('m2')
# print(m2.todense())
# hstack([m1, m2]).todense()

In [32]:
X_all = hstack([X, X_words])
X_all.shape
# X_all = X_words

In [22]:
# fit a LogisticRegression classifier.
clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
clf.fit(X_all, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [23]:
# for binary classification, LogisticRegression stores a single coefficient vector
clf.coef_
# this would be a matrix for a multi-class probem.

array([[ 0.07953326, -0.0132968 , -0.03093894, ..., -0.00630561,
         0.02287026,  0.02287026],
       [ 0.04056652,  0.02974951,  0.04463372, ..., -0.02042553,
        -0.01176631, -0.01176631],
       [-0.12009979, -0.01645271, -0.01369478, ...,  0.02673114,
        -0.01110395, -0.01110395]])

In [24]:
clf.coef_.shape

(3, 4283)

In [25]:
# for binary classification, the coefficients for the negative class is just the negative of the positive class.
coef = clf.coef_
print(coef)

[[ 0.07953326 -0.0132968  -0.03093894 ... -0.00630561  0.02287026
   0.02287026]
 [ 0.04056652  0.02974951  0.04463372 ... -0.02042553 -0.01176631
  -0.01176631]
 [-0.12009979 -0.01645271 -0.01369478 ...  0.02673114 -0.01110395
  -0.01110395]]


In [26]:
clf.classes_

array(['false', 'true', 'unknown'], dtype=object)

In [29]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []
for train, test in kf.split(X_all):
    clf.fit(X_all[train], y[train])
    pred = clf.predict(X_all[test])
    accuracies.append(accuracy_score(y[test], pred))
    
    
print('accuracy over all cross-validation folds: %s' % str(accuracies))
print('mean=%.2f std=%.2f' % (np.mean(accuracies), np.std(accuracies)))

accuracy over all cross-validation folds: [0.6337448559670782, 0.654320987654321, 0.6008230452674898, 0.6172839506172839, 0.6033057851239669]
mean=0.62 std=0.02
