## 1. Import libraries

In [20]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
from nltk.corpus import gutenberg
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer 

from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 
from sklearn.model_selection import train_test_split 


from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

nltk.download('gutenberg')
!python -m spacy download en

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Sunil\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[!] As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the full
pipeline package name 'en_core_web_sm' instead.
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [21]:
# Text normalisation applied to each raw novel before parsing.
def text_cleaner(text):
    """Strip markup noise from raw text and collapse its whitespace.

    The steps run in this order:

    1. every double dash ``--`` becomes a single space;
    2. square-bracketed spans such as ``[note]`` are deleted (non-greedy,
       and limited to one line because ``.`` does not match a newline);
    3. standalone numbers (integers or decimals), together with any
       whitespace / minus sign directly in front of them, become a space;
    4. all runs of whitespace are collapsed to one space and the ends trimmed.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, single-spaced text.
    """
    text = re.sub(r'--', ' ', text)
    # Raw string on purpose: a plain '...\[...' literal contains invalid escape
    # sequences, which newer Python versions warn about. The pattern matches
    # exactly what the character-class spelling '[\[].*?[\]]' matched.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b', ' ', text)
    # split() with no argument splits on any whitespace run and drops empties.
    text = ' '.join(text.split())
    return text

In [22]:
# Raw Gutenberg text of the two novels
alice = gutenberg.raw('carroll-alice.txt')
persuasion = gutenberg.raw('austen-persuasion.txt')

# Strip the chapter headings first, then run the shared cleaner on each novel
alice = text_cleaner(re.sub(r'CHAPTER .*', '', alice))
persuasion = text_cleaner(re.sub(r'Chapter \d+', '', persuasion))

# Run the spaCy pipeline over each cleaned novel
nlp = spacy.load('en_core_web_sm')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

# Pair every sentence span with the label of the author who wrote it
alice_sents = [[span, 'Carroll'] for span in alice_doc.sents]
persuasion_sents = [[span, 'Austen'] for span in persuasion_doc.sents]

# One frame for both novels: Carroll rows first, Austen rows after
sentences = pd.DataFrame(
    alice_sents + persuasion_sents,
    columns=['text', 'author'],
)
sentences.head()

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [14]:
# Replace every parsed sentence with a space-joined string of its lemmas,
# leaving out punctuation and stop words.
def _lemmatize(sentence):
    """Return the lemmas of ``sentence`` joined by single spaces.

    ``sentence`` is iterated token by token; tokens flagged ``is_punct`` or
    ``is_stop`` are skipped. A value that is already a ``str`` is returned
    unchanged so that re-running this cell does not fail on rows that were
    converted by an earlier run.
    """
    if isinstance(sentence, str):
        return sentence
    return ' '.join(
        token.lemma_
        for token in sentence
        if not token.is_punct and not token.is_stop
    )


# Assign the whole column at once instead of issuing one .loc write per row;
# this is positional, so it does not depend on the frame having a 0..n-1 index.
sentences['text'] = [_lemmatize(sentence) for sentence in sentences['text']]

## TF-IDF Vectorizer

In [15]:

# max_df=0.5 ignores terms found in more than half of the sentences and
# min_df=2 ignores terms found in fewer than two; rows are L2-normalised.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, use_idf=True, norm='l2', smooth_idf=True)

# Learn the vocabulary and build the sparse sentence-by-term TF-IDF matrix
X = vectorizer.fit_transform(sentences['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Dense copy of the matrix, one column per vocabulary term
tfidf_df = pd.DataFrame(X.toarray(), columns=feature_names)

# NOTE(review): if the vocabulary ever contains the tokens 'text' or 'author',
# the frame below ends up with duplicate column labels - verify before relying
# on sentences['text'] / sentences['author'] downstream.
sentences = pd.concat([tfidf_df, sentences[['text', 'author']]], axis=1)
sentences.head()

Unnamed: 0,abide,ability,able,abominate,abroad,absence,absent,absolute,absolutely,absurd,...,yer,yes,yesterday,yield,young,youth,zeal,zealous,text,author
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Alice begin tired sit sister bank have twice p...,Carroll
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,consider mind hot day feel sleepy stupid pleas...,Carroll
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,remarkable Alice think way hear Rabbit oh dear,Carroll
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,oh dear,Carroll
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,shall late,Carroll


## Modeling phase

In [7]:
# Predict the author of each sentence from its TF-IDF features

# One seed for the split and for the stochastic ensembles so a re-run of this
# cell reproduces the same scores.
SEED = 44

Y = sentences['author']
# Keyword form: passing the axis to DataFrame.drop positionally was deprecated
# in pandas 1.3 and raises a TypeError from pandas 2.0 on.
X = np.array(sentences.drop(columns=['text', 'author']))

# Split the dataset: 60% of the sentences for training, 40% held out
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=SEED)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier(random_state=SEED)
gbc = GradientBoostingClassifier(random_state=SEED)

# Fit each model and report accuracy on the training and held-out sets
for label, model in (
    ('Logistic Regression', lr),
    ('Random Forest', rfc),
    ('Gradient Boosting', gbc),
):
    model.fit(X_train, y_train)
    print(f'----------------------{label} Scores----------------------')
    print('Training set score:', model.score(X_train, y_train))
    print('\nTest set score:', model.score(X_test, y_test))


----------------------Logistic Regression Scores----------------------
Training set score: 0.9163784973278843

Test set score: 0.8864278982092366
----------------------Random Forest Scores----------------------
Training set score: 0.981766740018862

Test set score: 0.8873704052780396
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8588494184218799

Test set score: 0.8539114043355325


## Creating TF-IDF vectors for the last three sentences

Consider the following sentences:

1. "The best Monty Python sketch is the one about the dead parrot; I laughed so hard."
2. "I laugh when I think about Python's Ministry of Silly Walks sketch; it is funny, funny, funny, the best!"
3. "Chocolate is the best ice cream dessert topping, with a great taste."
4. "The Lumberjack Song is the funniest Monty Python bit; I can't think of it without laughing."
5. "I would rather put strawberries on my ice cream for dessert; they have the best taste."
6. "The taste of caramel is a fantastic accompaniment to tasty mint ice cream."

In [23]:
# Sentences 4-6 of the list above: the small corpus for the next vectorizer
sentA, sentB, sentC = (
    "The Lumberjack Song is the funniest Monty Python bit; I can't think of it without laughing.",
    "I would rather put strawberries on my ice cream for dessert; they have the best taste.",
    "The taste of caramel is a fantastic accompaniment to tasty mint ice cream.",
)

In [24]:
# Fit a default TF-IDF vectorizer on just the three example sentences.
# NOTE: this rebinds `vectorizer`, so the one fitted on the novels is no
# longer reachable under that name after this cell runs.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([sentA, sentB, sentC])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

# Densify the sparse matrix and label each column with its vocabulary term
dense = vectors.todense()
denselist = dense.tolist()
sent_df = pd.DataFrame(denselist, columns=features)

In [25]:
sent_df.head()

Unnamed: 0,accompaniment,best,bit,can,caramel,cream,dessert,fantastic,for,funniest,...,song,strawberries,taste,tasty,the,they,think,to,without,would
0,0.0,0.0,0.271642,0.271642,0.0,0.0,0.0,0.0,0.0,0.271642,...,0.271642,0.0,0.0,0.0,0.320872,0.0,0.271642,0.0,0.271642,0.0
1,0.0,0.276458,0.0,0.0,0.0,0.210254,0.276458,0.0,0.276458,0.0,...,0.0,0.276458,0.210254,0.0,0.163281,0.276458,0.0,0.0,0.0,0.276458
2,0.328961,0.0,0.0,0.0,0.328961,0.250183,0.0,0.328961,0.0,0.0,...,0.0,0.0,0.250183,0.328961,0.19429,0.0,0.0,0.328961,0.0,0.0
