# 0. Prep

In [None]:
%pylab inline
plt.style.use('ggplot')

import pandas as pd
import pickle
import multiprocessing
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import pearsonr
from sklearn.metrics import classification_report, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

# Directory holding the novel text files; FILENAME values from the metadata
# table are appended to this prefix when the texts are read.
text_path = '../CHICAGO_CORPUS/CHICAGO_NOVEL_CORPUS/'
# Corpus metadata table. Columns used in this notebook: PUBL_DATE, FILENAME,
# AUTH_ID and BOOK_ID (presumably one row per novel — TODO confirm).
meta_df = pd.read_csv('../CHICAGO_CORPUS/CHICAGO_NOVEL_CORPUS_METADATA/CHICAGO_CORPUS_NOVELS.csv')

### Corpus Sampling

In [None]:
# SAMPLE; RUN ONE TIME ONLY
#
# Draw 16 novels for every publication year 1901-2000 and give four of them to
# each experimental subset (SENT, PARG, NOVL, CTRL).  The draw is seeded so that
# re-running this cell reproduces the same sample instead of silently replacing
# sample_df.pkl with a different one.

SAMPLE_SEED = 0
NOVELS_PER_YEAR = 16
sample_rng = np.random.RandomState(SAMPLE_SEED)

sent_dexs, parg_dexs, novl_dexs, ctrl_dexs = [], [], [], []
for date in range(1901,2001):
    temp_df = meta_df[meta_df['PUBL_DATE']==date]
    if len(temp_df) < NOVELS_PER_YEAR:
        # Fail with the offending year rather than pandas' generic
        # "larger sample than population" message.
        raise ValueError('only %d novels with PUBL_DATE %d; %d are needed'
                         % (len(temp_df), date, NOVELS_PER_YEAR))
    dexs = temp_df.sample(NOVELS_PER_YEAR, random_state=sample_rng).index.tolist()
    sent_dexs += dexs[:4]
    parg_dexs += dexs[4:8]
    novl_dexs += dexs[8:12]
    ctrl_dexs += dexs[12:16]

# Index label -> subset name.  Filled in reverse priority order so that, should
# an index label ever occur in more than one list, SENT wins over PARG, PARG
# over NOVL and NOVL over CTRL.
subset_by_dex = {}
for subset_name, subset_dexs in [('CTRL', ctrl_dexs), ('NOVL', novl_dexs),
                                 ('PARG', parg_dexs), ('SENT', sent_dexs)]:
    for dex in subset_dexs:
        subset_by_dex[dex] = subset_name

sample_dexs = sorted(sent_dexs + parg_dexs + novl_dexs + ctrl_dexs)
subset = [subset_by_dex[dex] for dex in sample_dexs]

# .assign returns a new frame, so meta_df itself is never written to.
sample_df = meta_df.loc[sample_dexs].assign(SUBSET=subset)
sample_df.to_pickle('sample_df.pkl')

### Pre-Process

In [None]:
# Reload the sampled metadata (including its SUBSET column) from the pickle
# written by the sampling cell, so the sampling itself need not be repeated.
sample_df = pd.read_pickle('sample_df.pkl')

In [None]:
# Read the full text of every sampled novel, in sample_df row order.
texts = []
for novel_filename in sample_df['FILENAME']:
    novel_path = text_path+novel_filename
    with open(novel_path,'r') as novel_file:
        novel_text = novel_file.read()
    texts.append(novel_text)

In [None]:
# Tokenise every novel into paragraphs -> sentences -> words.
#
# segmented_texts[n] is a list of paragraphs; each paragraph is a list of
# sentences; each sentence is the list of tokens returned by word_tokenize.
# Paragraphs are the pieces of the raw text separated by a blank line ('\n\n').
# Comprehensions are used for the inner levels so that no inner loop indices
# are left behind in the notebook namespace.
segmented_texts = []

for i, this_text in enumerate(texts):
    print(i)
    segmented_texts.append([
        [word_tokenize(this_sentence) for this_sentence in sent_tokenize(this_paragraph)]
        for this_paragraph in this_text.split('\n\n')
    ])

In [None]:
# Free memory: the raw strings are no longer needed once segmented_texts exists.
del texts

In [None]:
# Build the INITIAL and FINAL text of every sampled novel.  How a novel is
# halved depends on its SUBSET:
#   NOVL - first half vs. second half of the novel's whole token stream
#   PARG - first halves of all paragraphs vs. second halves of all paragraphs
#   SENT - first halves of all sentences vs. second halves of all sentences
#   CTRL - the novel's tokens are shuffled, then the shuffled stream is halved
# Only the split that the novel's own subset requires is computed.

def _halve_each(token_lists):
    """Cut every token list in half; return (joined first halves, joined second halves).

    For a list of odd length the extra token goes to the second half.
    """
    first_halves, second_halves = [], []
    for tokens in token_lists:
        middle = len(tokens)//2
        first_halves += tokens[:middle]
        second_halves += tokens[middle:]
    return first_halves, second_halves


initial, final = [], []
subset = list(sample_df['SUBSET'])

for i in range(len(subset)):
    if i%10 == 0:
        print(i)

    text = segmented_texts[i]
    label = subset[i]

    if label == 'SENT':
        first_half, secnd_half = _halve_each(
            sentence for paragraph in text for sentence in paragraph)

    elif label == 'PARG':
        first_half, secnd_half = _halve_each(
            [word for sentence in paragraph for word in sentence] for paragraph in text)

    elif label in ('NOVL', 'CTRL'):
        text_tokens = [word for paragraph in text for sentence in paragraph for word in sentence]
        if label == 'CTRL':
            # Control condition: destroy the word order before halving.
            text_tokens = np.random.permutation(text_tokens)
        middle = len(text_tokens)//2
        first_half, secnd_half = text_tokens[:middle], text_tokens[middle:]

    else:
        # Skipping the row would leave `initial`/`final` shorter than sample_df
        # and shift every later novel onto the wrong row, so stop here instead.
        raise ValueError('unknown SUBSET %r for sample row %d' % (label, i))

    initial.append(" ".join(first_half))
    final.append(" ".join(secnd_half))

In [None]:
# Free memory: the nested token lists are no longer needed once the
# INITIAL / FINAL strings have been built.
del segmented_texts

In [None]:
# Attach the two text halves to the sample. Both lists were built by walking
# the sample in row order, so position n belongs to row n of sample_df.
sample_df['INITIAL'] = initial
sample_df['FINAL'] = final

In [None]:
# The strings now live in sample_df; drop the stand-alone lists to free memory.
del initial, final

In [None]:
# Cache the sample together with its INITIAL / FINAL text columns so the
# tokenisation above does not have to be repeated.
sample_df.to_pickle('sample_df_init_final.pkl')

# 1. Parameter Search: Ten-Fold Cross-Validation

In [None]:
# Start the modelling sections from the cached sample that already carries the
# INITIAL / FINAL text columns.
sample_df = pd.read_pickle('sample_df_init_final.pkl')

### SENT-PARG-NOVL

In [None]:
def master_function_cv(percentile):
    """Fit and evaluate the INITIAL-vs-FINAL classifier on one cross-validation fold.

    The notebook-level ``auth_list`` is cut into ten consecutive slices; slice
    number ``percentile`` is held out for testing and the model is fitted on
    the novels of every other author, so no author contributes texts to both
    sides of a fold.

    Reads the notebook globals ``auth_list``, ``spn_df``, ``num_feats`` and
    ``reg_coef``.

    Parameters
    ----------
    percentile : int
        Fold number, 0 through 9.

    Returns
    -------
    list
        ``[predictions, test_labels]``. ``predictions`` holds the predicted
        class of every held-out INITIAL text followed by every held-out FINAL
        text; ``test_labels`` holds the matching true classes
        (0 = INITIAL, 1 = FINAL).
    """
    # The fold must be selected by the argument: every task handed to the pool
    # carries a different value, which is what makes the ten folds differ.
    fold_start = percentile*len(auth_list)//10
    fold_stop = (percentile+1)*len(auth_list)//10
    test_auths = auth_list[fold_start:fold_stop]

    # Split by author: rows of held-out authors are the test set, all others train.
    is_test = spn_df['AUTH_ID'].isin(test_auths)
    train_df = spn_df[~is_test]
    test_df = spn_df[is_test]

    # Documents are stacked as all INITIAL texts followed by all FINAL texts,
    # so the labels are a run of 0s followed by a run of 1s of the same length.
    train_labels = [0]*len(train_df)+[1]*len(train_df)
    test_labels = [0]*len(test_df)+[1]*len(test_df)

    # Term counts normalised per document (norm='l1', no IDF weighting); the
    # vocabulary is learned from the training documents only.
    tv = TfidfVectorizer(stop_words='english', max_features = num_feats, use_idf=False, norm='l1')
    dtm_train = tv.fit_transform(list(train_df['INITIAL'])+list(train_df['FINAL'])).toarray()
    dtm_test = tv.transform(list(test_df['INITIAL'])+list(test_df['FINAL'])).toarray()

    # Standardise features with statistics estimated on the training fold only.
    sc = StandardScaler()
    dtm_train_norm = sc.fit_transform(dtm_train)
    dtm_test_norm = sc.transform(dtm_test)

    lr = LogisticRegression(C=reg_coef)
    lr.fit(dtm_train_norm, train_labels)
    predictions = lr.predict(dtm_test_norm)

    return [predictions, test_labels]

In [None]:
# Ten worker processes; maxtasksperchild=1 replaces each worker after it has
# completed a single task.
# NOTE(review): this pool is created before the following cell defines spn_df,
# auth_list, num_feats and reg_coef, which master_function_cv reads as globals.
# Worker processes may only hold the globals that existed when they were
# started — confirm they actually see the intended values.
pool = multiprocessing.Pool(10, maxtasksperchild=1)

In [None]:
# Grid search over vocabulary size (num_list) and regularisation strength
# (coef_list).  Every grid point is scored by ten-fold cross-validation in
# which the folds are formed from authors, using a fresh random author order.
spn_df = sample_df[sample_df['SUBSET'].isin(['SENT', 'PARG', 'NOVL'])]
auth_list = list(set(spn_df['AUTH_ID']))

num_list = [2000,3000,4000,5000]
coef_list = [1,0.1,0.01,0.001]

# f1_array[i][j] is the weighted F1 for num_list[i] features and C = coef_list[j].
f1_array = np.empty([len(num_list),len(coef_list)])

for i in range(len(num_list)):
    for j in range(len(coef_list)):
        all_preds, all_labels = [], []
        auth_list = np.random.permutation(auth_list)
        num_feats = num_list[i]
        reg_coef = coef_list[j]
        percentiles = list(range(10))

        # master_function_cv reads spn_df, auth_list, num_feats and reg_coef as
        # globals.  A worker process only knows the globals that existed when it
        # was started (fork start method), so the workers for this grid point
        # are started here, after the values above have been set, and discarded
        # again before the next grid point changes them.  The long-lived
        # notebook-level `pool` is deliberately not used for that reason.
        with multiprocessing.Pool(10, maxtasksperchild=1) as grid_pool:
            output = grid_pool.map(master_function_cv, percentiles)

        for predictions, test_labels in output:

            all_preds += list(predictions)
            all_labels += test_labels

        # f1_score expects (y_true, y_pred); with average='weighted' the class
        # weights come from the first argument, so the order matters.
        grid_f1 = f1_score(all_labels, all_preds, average='weighted')
        f1_array[i][j] = grid_f1
        print(num_feats, reg_coef, grid_f1)

In [None]:
# Shut down the notebook-level worker pool: stop accepting work, stop the
# worker processes, then wait for them to exit.
pool.close()
pool.terminate()
pool.join()

In [None]:
# Weighted F1 for every grid point: rows follow num_list, columns follow coef_list.
print(f1_array)

# 2. Leave-One-Out Predictions

### SENT-PARG-NOVL

In [None]:
def master_function_loocv(author):
    """Predict INITIAL/FINAL for one author's novels with a model trained on all other authors.

    Reads the notebook globals ``spn_df``, ``num_feats`` and ``reg_coef``.

    Parameters
    ----------
    author
        An ``AUTH_ID`` value; every row of ``spn_df`` with this value is held out.

    Returns
    -------
    list
        ``[predictions, index_labels]``. ``index_labels`` are the ``spn_df``
        index labels of the held-out rows. ``predictions`` has twice that
        length: the predicted class (0 = INITIAL, 1 = FINAL) of each held-out
        INITIAL text, followed by the predicted class of each held-out FINAL
        text, both in ``index_labels`` order.
    """
    train_df = spn_df[spn_df['AUTH_ID']!=author]
    test_df = spn_df[spn_df['AUTH_ID']==author]

    # Training documents are stacked as all INITIAL texts followed by all FINAL
    # texts, hence a run of 0s followed by an equally long run of 1s.
    train_labels = [0]*len(train_df)+[1]*len(train_df)

    # Term counts normalised per document (norm='l1', no IDF weighting); the
    # vocabulary is learned from the training documents only.
    tv = TfidfVectorizer(stop_words='english', max_features = num_feats, use_idf=False, norm='l1')
    dtm_train = tv.fit_transform(list(train_df['INITIAL'])+list(train_df['FINAL'])).toarray()
    dtm_test = tv.transform(list(test_df['INITIAL'])+list(test_df['FINAL'])).toarray()

    # Standardise features with statistics estimated on the training rows only.
    sc = StandardScaler()
    dtm_train_norm = sc.fit_transform(dtm_train)
    dtm_test_norm = sc.transform(dtm_test)

    lr = LogisticRegression(C=reg_coef)
    lr.fit(dtm_train_norm, train_labels)
    predictions = lr.predict(dtm_test_norm)

    return [predictions, test_df.index.tolist()]

In [None]:
# Leave-one-author-out predictions for the SENT / PARG / NOVL novels, computed
# ten authors at a time with one worker process per author.
spn_df = sample_df[sample_df['SUBSET'].isin(['SENT', 'PARG', 'NOVL'])]
auth_list = list(set(spn_df['AUTH_ID']))
all_outputs = []

num_feats = 3000
reg_coef = 0.001

# The number of batches follows from the number of authors, so no author is
# left without predictions however many there are.
AUTHORS_PER_BATCH = 10

for i, batch_start in enumerate(range(0, len(auth_list), AUTHORS_PER_BATCH)):
    print(i)
    ten_auths = auth_list[batch_start:batch_start+AUTHORS_PER_BATCH]

    # A fresh pool per batch, created after the globals read by
    # master_function_loocv are set; the `with` block also releases the
    # workers if a task raises.
    with multiprocessing.Pool(10, maxtasksperchild=1) as pool:
        output = pool.map(master_function_loocv, ten_auths)

    all_outputs += output

In [None]:
# Put the per-author predictions back into spn_df row order.
#
# Each entry of all_outputs is [predictions, dexs]: `dexs` are the spn_df index
# labels of one author's novels, and `predictions` holds the predictions for
# their INITIAL texts followed by the predictions for their FINAL texts.
initial_pred_by_dex, final_pred_by_dex = {}, {}

for predictions, dexs in all_outputs:
    n_novels = len(dexs)
    for i, this_dex in enumerate(dexs):
        initial_pred_by_dex[this_dex] = predictions[i]
        final_pred_by_dex[this_dex] = predictions[i+n_novels]

# A direct lookup per row replaces the repeated list.index() search; a row
# without a prediction raises KeyError here instead of misaligning the columns.
dex_list = spn_df.index.tolist()
initial_preds = [initial_pred_by_dex[dex] for dex in dex_list]
final_preds = [final_pred_by_dex[dex] for dex in dex_list]

In [None]:
# spn_df was produced by filtering sample_df; take an explicit copy before
# adding columns so the assignment is unambiguous and sample_df is untouched.
spn_df = spn_df.copy()
# Both columns hold predicted class labels (0 = INITIAL, 1 = FINAL): the first
# for each novel's INITIAL text, the second for its FINAL text.
spn_df['P(INIT)_INIT'] = initial_preds
spn_df['P(INIT)_FINAL'] = final_preds

In [None]:
# Overall leave-one-out F1.  f1_score expects (y_true, y_pred): every INITIAL
# text is truly class 0 and every FINAL text truly class 1.
f1_score([0]*len(spn_df)+[1]*len(spn_df),\
         list(spn_df['P(INIT)_INIT'])+list(spn_df['P(INIT)_FINAL']), average='weighted')

In [None]:
# Leave-one-out F1 separately for each way of halving the novels.
for subset in ['SENT', 'PARG', 'NOVL']:
    sub_df = spn_df[spn_df['SUBSET']==subset]
    sub_inits = list(sub_df['P(INIT)_INIT'])
    sub_finals = list(sub_df['P(INIT)_FINAL'])
    true_labels = [0]*len(sub_inits) + [1]*len(sub_finals)
    # f1_score expects (y_true, y_pred); the weighted average takes its class
    # weights from the first argument.
    this_f1 = f1_score(true_labels, sub_inits+sub_finals, average='weighted')
    print(subset, this_f1)

In [None]:
# Export the leave-one-out predictions with the identifying metadata columns.
these_columns = ['BOOK_ID', 'PUBL_DATE', 'SUBSET', 'P(INIT)_INIT', 'P(INIT)_FINAL']
score_df = spn_df.loc[:, these_columns]
score_df.to_csv('spn_loocv_score.csv', index=False)

# 3. Full Model

In [None]:
# Fit the final model on every SENT / PARG / NOVL novel and score the CTRL
# novels with it.
spn_df = sample_df[sample_df['SUBSET'].isin(['SENT', 'PARG', 'NOVL'])]
# Training documents are stacked as all INITIAL texts followed by all FINAL
# texts: class 0 = INITIAL, class 1 = FINAL.
labels = [0]*len(spn_df)+[1]*len(spn_df)

# Same settings as used for the leave-one-out predictions above.
num_feats = 3000
reg_coef = 0.001

# Term counts normalised per document (norm='l1', no IDF weighting).
tv = TfidfVectorizer(stop_words='english', max_features = num_feats, use_idf=False, norm='l1')
dtm = tv.fit_transform(list(spn_df['INITIAL'])+list(spn_df['FINAL'])).toarray()
# Only the INITIAL half of each CTRL novel is scored.
dtm_ctrl = tv.transform(list(sample_df[sample_df['SUBSET']=='CTRL']["INITIAL"])).toarray()

# The scaler is fitted on the training matrix and re-used for the CTRL matrix.
sc = StandardScaler()
dtm_norm = sc.fit_transform(dtm)
dtm_ctrl_norm = sc.transform(dtm_ctrl)

lr = LogisticRegression(C=reg_coef)
lr.fit(dtm_norm, labels)
# One row per CTRL novel; column 0 is the probability of class 0 (INITIAL).
predictions = lr.predict_proba(dtm_ctrl_norm)

In [None]:
# Export, for every CTRL novel, the model's probability that its INITIAL half
# belongs to class 0 (INITIAL).
min_columns = ['BOOK_ID', 'PUBL_DATE', 'SUBSET']
# Select rows and columns in one .loc call and copy, so the new column is
# added to an independent frame rather than to a slice of sample_df.
ctrl_score_df = sample_df.loc[sample_df['SUBSET']=='CTRL', min_columns].copy()
ctrl_score_df['P(INIT)_CTRL'] = predictions[:,0]
ctrl_score_df.to_csv('ctrl_score.csv', index=False)

In [None]:
# Export the fitted logistic-regression weight of every vocabulary term.
# get_feature_names() is not available in recent scikit-learn releases, where
# get_feature_names_out() replaces it; use whichever the installed version has.
if hasattr(tv, 'get_feature_names_out'):
    feature_names = list(tv.get_feature_names_out())
else:
    feature_names = tv.get_feature_names()

feature_weight_df = pd.DataFrame()
feature_weight_df['FEAT_NAME'] = feature_names
feature_weight_df['WEIGHT'] = lr.coef_[0]
feature_weight_df.to_csv('feature_weights.csv', index=False)

In [None]:
# Persist the fitted vectorizer, scaler and classifier together.  The `with`
# block closes the file, so the pickle is fully written before the cell ends.
with open('tv_sc_lr.p', 'wb') as model_file:
    pickle.dump([tv,sc,lr], model_file)