In [1]:
import os
import nltk
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
path = 'Author Corpus Mini Larger'
# output_path = path+' processed'
trigger_file_path = 'triggers.txt'

In [3]:
def load_paragraph_triggers(trigger_file_path):
    file  = open(trigger_file_path, "r")
    triggers = file.read().split('\n')
    file.close()
    return triggers

In [4]:
def process_paragraphs(paragraphs):
    extras=['\n', '\t']
    triggers = load_paragraph_triggers(trigger_file_path)
    resultant_paragraphs = []
    for sentence in paragraphs:
        trigger_flag=False
        for trigger in triggers:
            sentence=sentence.strip('\n')
            if sentence[:len(trigger)]==trigger:
                trigger_flag=True
                break
        if sentence and not trigger_flag:
            sentence = sentence.replace('\n', ' ')
            resultant_paragraphs.append(sentence)
    return resultant_paragraphs

In [5]:
def generate_text(paragraphs):
    result = ""
    for paragraph in paragraphs:
        result+=paragraph+'\n'
    return result

In [6]:
ID=0
CLASS=0
IDS=[]
labels=[]
sentences=[]
processed_sentence_by_id=dict()
author_code_by_id=dict()
# if not os.path.exists(output_path):
#     os.mkdir(output_path)
for folder in os.listdir(path):
    folder_path = path+'\\'+folder
    print(folder_path)
#     destination_folder_path = output_path+'\\'+folder
#     if not os.path.exists(destination_folder_path):
#         os.mkdir(destination_folder_path)
    
    # Generating Author ID
    author_id = ''
    for name in folder.split(' '):
        author_id+=name[0]
    author_code_by_id[CLASS]=author_id
    
    for filename in os.listdir(folder_path):
        print(filename)
        cur_path = folder_path+"\\"+filename
        file  = open(cur_path, "r", encoding='utf-8')
        text = file.read()
        file.close()
        
        paragraphs = text.split('.')
        paragraphs = process_paragraphs(paragraphs)      
        
        IDS.extend(np.arange(ID, ID+len(paragraphs)).tolist())
        sentences.extend(paragraphs)
        labels.extend([CLASS]*len(paragraphs))        
        
        for sentence in paragraphs:
            processed_sentence_by_id[ID]=sentence
            ID+=1

#         file = open(destination_folder_path+"\\"+filename, "x", encoding='utf-8')
#         file.write(preprocessed_text)
#         file.close()
    CLASS+=1

Author Corpus Mini Larger\Shakespeare, William
a midsummer nights dream
all the worlds a stage
alls well that ends well
as you like it
comedy of errors
hamlet
henry viii
king henry v
king lear
loves labours lost
macbeth
othello
romeo and juliet
song of the witches macbeth
sonnet 1
sonnet 10
sonnet 100
sonnet 101
sonnet 102
sonnet 103
sonnet 104
sonnet 105
sonnet 106
sonnet 107
sonnet 108
sonnet 109
sonnet 11
sonnet 110
sonnet 111
sonnet 112
sonnet 113
sonnet 114
sonnet 115
sonnet 116
sonnet 117
sonnet 118
sonnet 119
sonnet 12
sonnet 120
sonnet 121
sonnet 122
sonnet 123
sonnet 124
sonnet 125
sonnet 126
sonnet 127
sonnet 128
sonnet 129
sonnet 13
sonnet 130
sonnet 131
sonnet 132
sonnet 133
sonnet 134
sonnet 135
sonnet 136
sonnet 137
sonnet 138
sonnet 139
sonnet 14
sonnet 140
sonnet 141
sonnet 142
sonnet 143
sonnet 144
sonnet 145
sonnet 146
sonnet 147
sonnet 148
sonnet 149
sonnet 15
sonnet 150
sonnet 151
sonnet 152
sonnet 153
sonnet 154
sonnet 16
sonnet 17
sonnet 18
sonnet 19
sonnet 2
sonn

In [7]:
df = {'id':IDS, 'sentence':sentences, 'label':labels}
df = pd.DataFrame.from_dict(df)
original_df = df.copy()
df

Unnamed: 0,id,sentence,label
0,0,"Dramatis Personae THESEUS, Duke of Athens E...",0
1,1,Four days will quickly steep themselves in nig...,0
2,2,"Go, Philostrate, Stir up the Athenian youth to...",0
3,3,"Exit PHILOSTRATEHippolyta, I woo'd thee with m...",0
4,4,"Enter EGEUS, and his daughter HERMIA, LYSANDER...",0
...,...,...,...
63386,63386,"""What? Pepper beaten at last? I congratulate y...",2
63387,63387,Paley to bed,2
63388,63388,All these voices sounded gratefully in St,2
63389,63389,"John's ears as he lay half-asleep, and yet vi...",2


In [8]:
df['sentence'] = df.sentence.map(lambda x: x.lower())
df['sentence'] = df.sentence.str.replace('[^\w\s]', '')
df['sentence'] = df['sentence'].apply(nltk.word_tokenize)

In [9]:
df['sentence'] = df['sentence'].apply(lambda x: ' '.join(x))

count_vect = CountVectorizer()
counts = count_vect.fit_transform(df['sentence'])
# transformer = TfidfTransformer().fit(counts)
# counts = transformer.transform(counts)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.1, random_state=69)
model = MultinomialNB().fit(X_train, y_train)
predicted = model.predict(X_test)
print('Accuracy:', accuracy_score(predicted, y_test))

Accuracy: 0.8596214511041009


In [11]:
model = MultinomialNB().fit(counts, df['label'])
sentence_probs = model.predict_proba(counts)
sentence_probs.shape

(63391, 3)

Finding out the probability confidence, (out of 1), the classifier had for the true class label as compared to the others

In [12]:
true_class_probability=[]
for itr in range(df.shape[0]):
    label=df.iloc[itr]['label']
    true_class_probability.append(sentence_probs[itr][label])
df['confidence']=true_class_probability
df

Unnamed: 0,id,sentence,label,confidence
0,0,dramatis personae theseus duke of athens egeus...,0,1.000000
1,1,four days will quickly steep themselves in nig...,0,0.999977
2,2,go philostrate stir up the athenian youth to m...,0,1.000000
3,3,exit philostratehippolyta i wood thee with my ...,0,1.000000
4,4,enter egeus and his daughter hermia lysander a...,0,1.000000
...,...,...,...,...
63386,63386,what pepper beaten at last i congratulate you ...,2,0.999656
63387,63387,paley to bed,2,0.965494
63388,63388,all these voices sounded gratefully in st,2,0.989200
63389,63389,johns ears as he lay halfasleep and yet vividl...,2,0.931691


Building Dataset

In [13]:
threshold_length = 5
author_num_statement_limit=0.9
author_common_statement_ratio=0.7
statement_identifiers=[]
author_statements=[]
author_labels=[]
for CLASS_ID in range(CLASS):
    CLASS_DF = df[df['label']==CLASS_ID]
    print(CLASS_ID, CLASS_DF.shape)
    CLASS_DF = CLASS_DF.sort_values(by=['confidence'], ascending=True)

    total_sentences=CLASS_DF.shape[0]
    total_sentences_selected=int(total_sentences*author_num_statement_limit)
    
    common_sentence_count=int(total_sentences_selected*author_common_statement_ratio)
    unique_sentence_count=total_sentences_selected-common_sentence_count   
    
    UNIQUE = CLASS_DF.tail(unique_sentence_count)
    COMMON = CLASS_DF.head(common_sentence_count)
    
    CLASS_DF = pd.concat([UNIQUE, COMMON]).sample(frac=1).reset_index(drop=True)
    print(CLASS_ID, CLASS_DF.shape)
    print()

    author_id = author_code_by_id[CLASS_ID]
    for row in range(CLASS_DF.shape[0]):
        sentence_id = CLASS_DF.iloc[row]['id']
        sentence = processed_sentence_by_id[sentence_id]
        if len(sentence.split(' '))>threshold_length:
            statement_identifiers.append(sentence_id)
            author_statements.append(sentence)
            author_labels.append(author_id)

0 (23749, 4)
0 (21374, 4)

1 (18898, 4)
1 (17008, 4)

2 (20744, 4)
2 (18669, 4)



In [14]:
dataset = {'id':statement_identifiers, 'text':author_statements, 'author':author_labels}
dataset = pd.DataFrame.from_dict(dataset)
dataset

Unnamed: 0,id,text,author
0,77,"Enter EGEUS, and his daughter HERMIA, LYSANDER...",SW
1,6481,"After the Duke his Father, with the knife He ...",SW
2,8926,"Now by Apollo- Now by Apollo, King, Thou swear...",SW
3,20382,"But thus, I trust, you will not marry her",SW
4,4184,"My lord, I came to see your father's funeral",SW
...,...,...,...
45393,43227,Look how he bends as they reach the gateway,WV
45394,58592,She was laughing at Miss Allan,WV
45395,52368,And yet she liked him,WV
45396,44849,Then Florinda laid her hand upon his knee,WV


In [15]:
dataset = dataset.sample(frac=1).reset_index(drop=True)

y = dataset['author']
X = dataset.drop(['author'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
X_train['author']=y_train
X_test['author']=y_test

print(X_train.shape)
print(X_test.shape)

X_train.to_excel('train.xlsx', index=False, encoding='utf-8-sig')
X_test.to_excel('test.xlsx', index=False, encoding='utf-8-sig')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(36318, 3)
(9080, 3)


#### Old Code

Finding word, class and word given class probabilities

In [16]:
# total_words=0
# word_probability=dict()
# class_probability=[0]*df.label.nunique()
# for row in range(df.shape[0]):
#     sentence, label = df.iloc[row]['sentence'], df.iloc[row]['label']
#     sentence = nltk.word_tokenize(sentence)
#     class_probability[int(label)]+=len(sentence)
#     for word in sentence:
#         if word in count_vect.vocabulary_:
#             if word not in word_probability:
#                 word_probability[word]=1
#             else:
#                 word_probability[word]+=1
#             total_words+=1

# for word in word_probability:
#     word_probability[word]/=total_words
# for itr, cls in enumerate(class_probability):
#     class_probability[itr]/=total_words
# print(class_probability)    

In [17]:
# p_word_given_class = dict()
# p_class_given_word = dict()

# for key, probs in zip(count_vect.vocabulary_, model.feature_log_prob_.T):
#     # converting log probability to probability
#     p_word_given_class[key]=np.exp(probs)
#     p_class_given_word[key]=[]
    
#     for label, p_class in enumerate(class_probability):
#         p_class_given_word[key].append((p_word_given_class[key][label]*p_class)/word_probability[key])

# for key in p_word_given_class:
#     print(p_word_given_class[key])
#     print(p_class_given_word[key])
#     break

In [18]:
# p_word_given_class = dict()
# p_class_given_word = dict()

# for key, probs in zip(count_vect.vocabulary_, model.coef_.T):
#     # converting log probability to probability
#     p_word_given_class[key]=np.exp(probs)
#     p_class_given_word[key]=[]
    
#     for label, p_class in enumerate(class_probability):
#         p_class_given_word[key].append((p_word_given_class[key][label]*p_class)/word_probability[key])

# for key in p_word_given_class:
#     print(p_word_given_class[key])
#     print(p_class_given_word[key])
#     break