In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sanskar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
df = pd.read_csv("book.csv")

In [6]:
df.shape

(4657, 4)

In [7]:
df

Unnamed: 0,index,title,genre,summary
0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...
1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ..."
2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...
3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...
4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...
...,...,...,...,...
4652,4652,Hounded,fantasy,"Atticus O’Sullivan, last of the Druids, lives ..."
4653,4653,Charlie and the Chocolate Factory,fantasy,Charlie Bucket's wonderful adventure begins wh...
4654,4654,Red Rising,fantasy,"""I live for the dream that my children will be..."
4655,4655,Frostbite,fantasy,"Rose loves Dimitri, Dimitri might love Tasha, ..."


In [9]:
df.isnull().sum()

index      0
title      0
genre      0
summary    0
dtype: int64

In [10]:
df['full'] = df['title'] + ' '+ df['summary']

In [13]:
print(df['full'])

0       Drowned Wednesday  Drowned Wednesday is the fi...
1       The Lost Hero  As the book opens, Jason awaken...
2       The Eyes of the Overworld  Cugel is easily per...
3       Magic's Promise  The book opens with Herald-Ma...
4       Taran Wanderer  Taran and Gurgi have returned ...
                              ...                        
4652    Hounded Atticus O’Sullivan, last of the Druids...
4653    Charlie and the Chocolate Factory Charlie Buck...
4654    Red Rising "I live for the dream that my child...
4655    Frostbite Rose loves Dimitri, Dimitri might lo...
4656    Radiance The Prince of no value\nBrishen Khask...
Name: full, Length: 4657, dtype: object


In [14]:
X = df.drop(columns='genre' , axis = 1)
Y  = df['genre']


In [16]:
port_stem = PorterStemmer()

In [17]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]' , ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [19]:
df['full'] = df['full'].apply(stemming)

In [21]:
print(df['full'])

0       drown wednesday drown wednesday first truste a...
1       lost hero book open jason awaken school bu una...
2       eye overworld cugel easili persuad merchant fi...
3       magic promis book open herald mage vanyel retu...
4       taran wander taran gurgi return caer dallben f...
                              ...                        
4652    hound atticu sullivan last druid live peac ari...
4653    charli chocol factori charli bucket wonder adv...
4654    red rise live dream children born free say lik...
4655    frostbit rose love dimitri dimitri might love ...
4656    radianc princ valu brishen khaskem princ kai l...
Name: full, Length: 4657, dtype: object


In [22]:
X = df['full'].values
Y = df['genre'].values

In [23]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)

In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y , test_size = 0.2, stratify = Y , random_state = 2)

In [37]:
model = LogisticRegression()

In [38]:
model.fit(X_train, Y_train)

In [39]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [40]:
print(training_data_accuracy)

0.9036241610738255


In [41]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print(test_data_accuracy)

0.6448497854077253
