In [3]:
import pandas as pd
from fastai import *
from fastai.text import *
from fastai.utils.mem import GPUMemTrace #call with mtrace

In [4]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

### prepare df

In [6]:
novels = pd.read_csv("/home/tessa/reading_age/projects/24_novels/chunks/scored.csv")
novels = novels.reset_index()
novels = novels[["Text","Level"]]

In [7]:
novels['Level'] = novels['Level'].apply(lambda x: 'Child' if x!= 'Adult' else 'Adult')

In [8]:
novels['is_valid'] = False

In [9]:
novels['is_valid'][::5] = True
novels

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Text,Level,is_valid
0,PREFACE Sir J. M. Barrie's delightful creation...,Adult,True
1,Then she turned the bedclothes neatly down and...,Adult,False
2,Darling told him about the weird apparition at...,Adult,False
3,"Suddenly the night-lights flickered, waned, an...",Adult,False
4,No wonder he was crying! But that was not the ...,Adult,False
...,...,...,...
5271,"George took the fun more soberly, and stuck to...",Adult,False
5272,Harris said he would introduce us both to the ...,Adult,False
5273,"After which, we managed to get some fitful slu...",Adult,False
5274,“Yes it’s almost a pity we’ve made up our mind...,Adult,False


### create labelist

In [10]:
novel_lablist = (TextList.from_df(novels,cols = 'Text')).split_from_df(col = 'is_valid').label_from_df(cols='Level')

In [12]:
def get_doc_term_matrix(text_list, n_terms):
    
    # inputs:
    #    text_list, a TextList object
    #    n_terms, the number of tokens in our IMDb vocabulary
    
    # output: 
    #    the CSR format sparse representation of the document-term matrix in the form of a
    #    scipy.sparse.csr.csr_matrix object

    
    # initialize arrays
    values = []
    column_indices = []
    row_pointer = []
    row_pointer.append(0)

    # from the TextList object
    for _, doc in enumerate(text_list):
        feature_counter = Counter(doc.data)
        column_indices.extend(feature_counter.keys())
        values.extend(feature_counter.values())
        # Tack on N (number of nonzero elements in the matrix) to the end of the row_pointer array
        row_pointer.append(len(values))
        
    return scipy.sparse.csr_matrix((values, column_indices, row_pointer),
                                   shape=(len(row_pointer) - 1, n_terms),
                                   dtype=int)

In [13]:
%%time
train_doc_term = get_doc_term_matrix(novel_lablist.train.x, len(novel_lablist.vocab.itos))

CPU times: user 1.56 s, sys: 96 ms, total: 1.65 s
Wall time: 1.5 s


In [14]:
train_doc_term.shape

(4220, 19744)

In [15]:
valid_doc_term = get_doc_term_matrix(novel_lablist.valid.x, len(novel_lablist.vocab.itos))
valid_doc_term.shape

(1056, 19744)

In [16]:
novel_lablist.classes
novel_lablist.train.y.c2i

{'Adult': 0, 'Child': 1}

In [90]:
novel_lablist.train.y.classes[0]

'Adult'

### priors

In [17]:
# get the class priors for adult & child
child_prior = novel_lablist.train.y.items.mean()
adult_prior = 1 - child_prior

In [18]:
bias = np.log(adult_prior/child_prior)

In [19]:
bias

2.9444389791664403

### conditional probabilities

In [20]:
adult_rows = np.squeeze(np.argwhere(novel_lablist.y.items == novel_lablist.train.y.c2i['Adult']))
adult_rows

array([   0,    1,    2,    3, ..., 4216, 4217, 4218, 4219])

In [21]:
adult_count = train_doc_term[adult_rows].sum(axis = 0)
adult_count

matrix([[30913,     0,  4009,     0, ...,     3,     0,     0,     0]], dtype=int64)

In [22]:
adult_liklihood = (adult_count + 1)/(adult_count.sum() + 1)
adult_liklihood

matrix([[1.605778e-02, 5.194338e-07, 2.082930e-03, 5.194338e-07, ..., 2.077735e-06, 5.194338e-07, 5.194338e-07,
         5.194338e-07]])

In [23]:
child_rows = np.squeeze(np.argwhere(novel_lablist.y.items == novel_lablist.train.y.c2i['Child']))
len(child_rows)

211

In [24]:
child_count = train_doc_term[child_rows].sum(axis = 0)

In [25]:
child_count.shape

(1, 19744)

In [26]:
child_liklihood = (child_count + 1)/(child_count.sum() + 1)
child_liklihood

matrix([[1.536428e-03, 8.932719e-06, 1.893736e-03, 8.932719e-06, ..., 8.932719e-06, 8.932719e-06, 8.932719e-06,
         8.932719e-06]])

In [27]:
np.argpartition(child_liklihood, -5, axis = 1)[0,-5:]

matrix([[10, 11,  5, 71,  9]])

In [28]:
log_ratio = np.log(adult_liklihood/child_liklihood)
log_ratio

matrix([[ 2.346733, -2.844737,  0.095224, -2.844737, ..., -1.458442, -2.844737, -2.844737, -2.844737]])

In [29]:
np.asarray(np.argpartition(log_ratio,-10)[0,-10:])[0]

array([109, 106,  94,  65,  78,  51,  39,  41,  40,  36])

In [30]:
# largest args - should be most adulty
[novel_lablist.vocab.itos[i] for i in np.asarray((np.argpartition(log_ratio,-20)[0,-20:]))[0]]

[']',
 '[',
 'cosette',
 'bloom',
 'valjean',
 'mrs.',
 '(',
 'marius',
 'jean',
 ')',
 'mr.',
 '“',
 '‘',
 '”',
 '’s',
 'upon',
 'n’t',
 '–',
 '—',
 '’']

In [31]:
# smallest args should be most childy
[novel_lablist.vocab.itos[i] for i in np.asarray((np.argpartition(log_ratio,20)[0,:20]))[0]]

['neverland',
 'http',
 'onto',
 'silliness',
 'classroom',
 'english-e-books.net',
 'vampires',
 'unwise',
 'hypnotize',
 'okay',
 '\t',
 'lawrence',
 'anymore',
 'rented',
 'greetings',
 'nanny',
 'kilometres',
 "'",
 'cabdriver',
 'hythe']

### run model

#### naive bayes

In [32]:
# FULL
# greater than 0 is adult
train_prec = (np.squeeze(np.asarray((train_doc_term @ log_ratio.T + bias)>0)) == (novel_lablist.train.y.items == 0)).mean()
train_prec

0.9398104265402843

In [33]:
valid_prec = (np.squeeze(np.asarray((valid_doc_term @ log_ratio.T + bias)>0)) == (novel_lablist.valid.y.items == 0)).mean()
valid_prec

0.9365530303030303

In [34]:
# BINARIZED
train_prec = (np.squeeze(np.asarray((train_doc_term.sign() @ log_ratio.T + bias)>0)) == (novel_lablist.train.y.items == 0)).mean()
train_prec

0.9248815165876777

In [35]:
valid_prec = (np.squeeze(np.asarray((valid_doc_term.sign() @ log_ratio.T + bias)>0)) == (novel_lablist.valid.y.items == 0)).mean()
valid_prec

0.9204545454545454

#### logistic regression

In [36]:
# FULL
m = LogisticRegression(C=0.1, dual=False,solver = 'liblinear')
# 'liblinear' and 'newton-cg' solvers both get 0.88328 accuracy
# 'sag', 'saga', and 'lbfgs' don't converge
m.fit(train_doc_term, novel_lablist.train.y.items.astype(int))
preds = m.predict(valid_doc_term)
valid_accuracy = (preds==novel_lablist.valid.y.items).mean()
print(f'Validation accuracy is {valid_accuracy} using the full doc-term matrix')

Validation accuracy is 0.9962121212121212 using the full doc-term matrix


In [37]:
# BINARIZED
m = LogisticRegression(C=0.1, dual=False,solver = 'liblinear')
m.fit(train_doc_term.sign(), novel_lablist.train.y.items.astype(int))
preds = m.predict(valid_doc_term.sign())
valid_accuracy = (preds==novel_lablist.valid.y.items).mean()
print(f'Validation accuracy is {valid_accuracy} using the binarized doc-term matrix')

Validation accuracy is 0.9971590909090909 using the binarized doc-term matrix


### with trigrams

In [67]:
veczr = CountVectorizer(ngram_range=(1,3), preprocessor=noop, tokenizer=noop, max_features=400000)

In [68]:
train_docs = novel_lablist.train.x
train_words = [[novel_lablist.vocab.itos[o] for o in doc.data] for doc in novel_lablist.train.x]


In [69]:
%%time
train_ngram_doc_matrix_veczr = veczr.fit_transform(train_words)
train_ngram_doc_matrix_veczr

CPU times: user 13.3 s, sys: 4.59 s, total: 17.9 s
Wall time: 18 s


<4220x400000 sparse matrix of type '<class 'numpy.int64'>'
	with 3362605 stored elements in Compressed Sparse Row format>

In [70]:
valid_docs = novel_lablist.valid.x
valid_words = [[novel_lablist.vocab.itos[o] for o in doc.data] for doc in novel_lablist.valid.x]


In [71]:
%%time
valid_ngram_doc_matrix_veczr = veczr.fit_transform(valid_words)
valid_ngram_doc_matrix_veczr

CPU times: user 3.34 s, sys: 140 ms, total: 3.48 s
Wall time: 3.48 s


<1056x400000 sparse matrix of type '<class 'numpy.int64'>'
	with 1016571 stored elements in Compressed Sparse Row format>

In [48]:
vocab = veczr.get_feature_names()

In [72]:
m = LogisticRegression(C=0.1, dual=False, solver = 'liblinear')
m.fit(train_ngram_doc_matrix_veczr, novel_lablist.y.items);

preds = m.predict(valid_ngram_doc_matrix_veczr)
accuracy =(preds==novel_lablist.valid.y.items).mean()
print(f'Accuracy  = {accuracy} for Logistic Regression, with full trigram counts from `CountVectorizer`' )

Accuracy  = 0.9081439393939394 for Logistic Regression, with full trigram counts from `CountVectorizer`


In [79]:
# fit model
m = LogisticRegression(C=0.1, dual=False, solver = 'liblinear')
m.fit(train_ngram_doc_matrix_veczr.sign(), novel_lablist.y.items);

# get predictions
preds = m.predict(valid_ngram_doc_matrix_veczr.sign())
valid_labels = [label == novel_lablist.valid.y.c2i['Child'] for label in novel_lablist.valid.y.items]

# check accuracy
accuracy = (preds==valid_labels).mean()
print(f'Accuracy = {accuracy} for Logistic Regression, with binarized trigram counts from `CountVectorizer`' )

Accuracy = 0.9583333333333334 for Logistic Regression, with binarized trigram counts from `CountVectorizer`


In [75]:
novel_lablist.valid.y.c2i

{'Adult': 0, 'Child': 1}

## doing everything

In [103]:
def get_bias_and_log_ratio(y_items, train_doc_term):
    # bias
    c1_prior = y_items.mean()
    bias = np.log(c1_prior/(1- c1_prior))
    
    # log ratio
    c1_rows = np.squeeze(np.argwhere(y_items == 1))
    c1_count = train_doc_term[c1_rows].sum(axis = 0)
    c1_liklihood = (c1_count + 1)/(c1_count.sum() + 1)
    
    c0_rows = np.squeeze(np.argwhere(y_items == 0))
    c0_count = train_doc_term[c0_rows].sum(axis = 0)
    c0_liklihood = (c0_count + 1)/(c0_count.sum() + 1)
    
    log_ratio = np.log(c1_liklihood/c0_liklihood)
    
    return bias, log_ratio
  

In [129]:
def classify(label_list):
    term_docs = []
    
    # CREATE TERM DOCS
    # class unigram
    train_doc_term = get_doc_term_matrix(label_list.train.x, len(label_list.vocab.itos))
    valid_doc_term = get_doc_term_matrix(label_list.valid.x, len(label_list.vocab.itos))
    term_docs.append(('classic_unigram',train_doc_term, valid_doc_term))
    
    # count vectorizer trigram
    veczr = CountVectorizer(ngram_range=(1,3), preprocessor=noop, tokenizer=noop, max_features=400000)
    train_words = [[label_list.vocab.itos[o] for o in doc.data] for doc in label_list.train.x]
    train_ngram_doc_matrix_veczr = veczr.fit_transform(train_words)
    
    valid_words = [[label_list.vocab.itos[o] for o in doc.data] for doc in label_list.valid.x]
    valid_ngram_doc_matrix_veczr = veczr.fit_transform(valid_words)
    term_docs.append(('count_vectorizer_trigram', train_ngram_doc_matrix_veczr, valid_ngram_doc_matrix_veczr))

    
    df_list = []
    # RUN MODELS
    for term_doc_name, train_doc_term, valid_doc_term in term_docs:
        bias, log_ratio = get_bias_and_log_ratio(label_list.train.y.items, train_doc_term)
       
        m = LogisticRegression(C=0.1, dual=False,solver = 'liblinear')
        
        for i in range(2):
            if i == 0:
                count_style = 'full'
            else:
                count_style = 'binary'
                train_doc_term = train_doc_term.sign()
                valid_doc_term = valid_doc_term.sign()

            train_prec = (np.squeeze(np.asarray((train_doc_term @ log_ratio.T + bias)>0)) == label_list.train.y.items).mean()
            valid_prec = (np.squeeze(np.asarray((valid_doc_term @ log_ratio.T + bias)>0)) == label_list.valid.y.items).mean()
            df_list.append({'Model':'Bayes', 'Count_style':count_style, 'Term_doc':term_doc_name, 'Data_type':'Training','Result':train_prec})
            df_list.append({'Model':'Bayes', 'Count_style':count_style, 'Term_doc':term_doc_name, 'Data_type':'Validation','Result':valid_prec})


            m.fit(train_doc_term, label_list.train.y.items.astype(int))
            preds = m.predict(valid_doc_term)
            valid_accuracy = (preds==label_list.valid.y.items).mean()
            df_list.append({'Model':'Logistic regression', 'Count_style':count_style, 'Term_doc':term_doc_name, 'Data_type':'Validation','Result':valid_accuracy})

    results = pd.DataFrame(df_list)
    results = results.sort_values('Result')
        
    return results
    

classify(novel_lablist)

Unnamed: 0,Model,Count_style,Term_doc,Data_type,Result
7,Bayes,full,count_vectorizer_trigram,Validation,0.050189
10,Bayes,binary,count_vectorizer_trigram,Validation,0.050189
9,Bayes,binary,count_vectorizer_trigram,Training,0.123223
6,Bayes,full,count_vectorizer_trigram,Training,0.599052
8,Logistic regression,full,count_vectorizer_trigram,Validation,0.908144
4,Bayes,binary,classic_unigram,Validation,0.920455
3,Bayes,binary,classic_unigram,Training,0.924882
1,Bayes,full,classic_unigram,Validation,0.936553
0,Bayes,full,classic_unigram,Training,0.93981
11,Logistic regression,binary,count_vectorizer_trigram,Validation,0.958333


In [125]:
df_novs = pd.read_csv("/home/tessa/reading_age/projects/24_novels/chunks/scored.csv")
df_novs = df_novs.reset_index()[["Name","Text"]]
df_novs["Label"] = df_novs["Name"].apply(lambda x: 'Les_Mis' if x == 'Les_Miserables' else 'Not_les_mis')
df_novs['is_valid'] = False
df_novs['is_valid'][::5] = True
lesmis_lablist = (TextList.from_df(df_novs,cols = 'Text')).split_from_df(col = 'is_valid').label_from_df(cols='Label')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [130]:
# well seems to be pretty good at identifying les mis!
classify(lesmis_lablist)

Unnamed: 0,Model,Count_style,Term_doc,Data_type,Result
7,Bayes,full,count_vectorizer_trigram,Validation,0.294508
10,Bayes,binary,count_vectorizer_trigram,Validation,0.294508
8,Logistic regression,full,count_vectorizer_trigram,Validation,0.614583
11,Logistic regression,binary,count_vectorizer_trigram,Validation,0.679924
1,Bayes,full,classic_unigram,Validation,0.988636
6,Bayes,full,count_vectorizer_trigram,Training,0.993128
4,Bayes,binary,classic_unigram,Validation,0.995265
9,Bayes,binary,count_vectorizer_trigram,Training,0.995498
2,Logistic regression,full,classic_unigram,Validation,0.996212
5,Logistic regression,binary,classic_unigram,Validation,0.996212
