In [85]:
import pandas as pd
import numpy as np
import re
import nltk
import string
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

#nltk.download("all")

In [None]:
df = pd.read_csv("data.csv")

In [54]:
df.isnull().sum()

index      0
title      0
genre      0
summary    0
dtype: int64

In [55]:
stop_words_list = stopwords.words("english")

In [56]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r"\'", ' ', text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    words = text.split()
    words = [word for word in words if not word in stop_words_list]
    words = [word.strip() for word in words if len(word.strip()) > 1]
    text = " ".join(words)
    return text

In [57]:
df["summary"] = df["summary"].apply(preprocess)

In [59]:
def lemmatizing(text):
    lemmatizer = WordNetLemmatizer()
    stemmed_text = ""
    for word in text.split():
        stem = lemmatizer.lemmatize(word)
        stemmed_text += stem
        stemmed_text += " "
        
    stemmed_text = stemmed_text.strip()
    return stemmed_text

In [63]:
df["summary"] = df["summary"].apply(lemmatizing)

In [69]:
def stemming(text):
    stemmed_text = ""
    stemmer = PorterStemmer()
    for word in text.split():
        stem = stemmer.stem(word)
        stemmed_text += stem
        stemmed_text += " "
        
    stemmed_text = stemmed_text.strip()
    return stemmed_text

In [70]:
df["summary"] = df["summary"].apply(stemming)

In [73]:
le = LabelEncoder().fit(df["genre"])
le_nm = dict(zip(le.classes_, le.transform(le.classes_)))
df["genre"] = df["genre"].apply(lambda x: le_nm[x])
le_nm

{'crime': 0,
 'fantasy': 1,
 'history': 2,
 'horror': 3,
 'psychology': 4,
 'romance': 5,
 'science': 6,
 'sports': 7,
 'thriller': 8,
 'travel': 9}

In [82]:
count_vectorizer = CountVectorizer(max_features=1000)
bow = count_vectorizer.fit_transform(df["summary"])
X = df["summary"]
y = df["genre"]

In [83]:
bow

<4657x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 342334 stored elements in Compressed Sparse Row format>

In [84]:
X_train, X_test, y_train, y_test = train_test_split(bow, y, test_size=0.2, random_state=4242)

In [86]:
mb = MultinomialNB()
mb.fit(X_train, y_train)
y_pred = mb.predict(X_test)
print("MultinomialNB: ", accuracy_score(y_test, y_pred))

MultinomialNB:  0.6223175965665236


In [87]:
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print("SVC: ", accuracy_score(y_test, y_pred))

SVC:  0.5472103004291845


In [88]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("RandomForest: ", accuracy_score(y_test, y_pred))

RandomForest:  0.5954935622317596


In [90]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4242)

In [91]:
tfidf = TfidfVectorizer(max_features=10000)
X_train_scaled = tfidf.fit_transform(X_train.values.astype("U"))
X_test_scaled = tfidf.transform(X_test.values.astype("U"))

In [92]:
svc = SVC()
svc.fit(X_train_scaled, y_train)
y_pred = svc.predict(X_test_scaled)
print("SVC: ", accuracy_score(y_test, y_pred))

SVC:  0.6459227467811158


In [93]:
mb = MultinomialNB()
mb.fit(X_train_scaled, y_train)
y_pred = mb.predict(X_test_scaled)
print("MultinomialNB: ", accuracy_score(y_test, y_pred))

MultinomialNB:  0.5321888412017167


In [94]:
rf = RandomForestClassifier()
rf.fit(X_train_scaled, y_train)
y_pred = rf.predict(X_test_scaled)
print("RandomForest: ", accuracy_score(y_test, y_pred))

RandomForest:  0.6083690987124464


In [95]:
pickle.dump(svc, open("SVCmodel.pickle", "wb"))

In [96]:
pickle.dump(tfidf, open("TFIDFmodel.pickle", "wb"))