In [None]:
import pandas as pd
import numpy as np
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
import re
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fetch every NLTK resource this notebook relies on:
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize
#     (newer NLTK releases load 'punkt_tab' instead of 'punkt'),
#   - 'stopwords': the corpus read by stopwords.words('english') further down,
#     which the original cell never downloaded.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
# Load the review dataset; later cells read its 'text' and 'label' columns.
data = pd.read_csv("movie.csv")

In [None]:
# Preview the first rows to sanity-check what was loaded.
data.head()

In [None]:
# Number of rows in the dataset.
len(data)

In [None]:
# Exploratory analysis

In [None]:
# Class balance: how many rows carry each distinct label value.
data['label'].value_counts()

In [None]:
# Separate the raw review text (model input) from the target labels.
X, y = data['text'], data['label']

In [None]:
# Tokenization

In [None]:
# Split every review into a list of word/punctuation tokens.
# Iterating over the Series values directly (instead of indexing X[i] with a
# hand-maintained counter) avoids a label-based lookup that fails whenever the
# index is not exactly 0..n-1, and does not leak a loop counter into the
# notebook namespace.
tokenized_sentences = [word_tokenize(text) for text in X]

In [None]:
# Lower-case conversion and punctuation removal

In [None]:
# Lower-case every token and replace each non-alphanumeric character with a
# single space. The pattern is compiled once outside the loop, and each inner
# list is updated in place (slice assignment) so `tokenized_sentences` keeps
# referring to the same list objects.
non_alnum = re.compile(r"[^a-zA-Z0-9]")
for sent in tokenized_sentences:
    sent[:] = [non_alnum.sub(" ", token.lower()) for token in sent]

In [None]:
# English stop-word list, extended with the single-space token that the
# punctuation clean-up produces for one-character punctuation marks.
stopwords_en = [*stopwords.words('english'), ' ']

In [None]:
# Remove stop words from every tokenized review.
# A set gives O(1) membership tests instead of scanning the whole stop-word
# list once per token.
stopword_set = set(stopwords_en)
# Tokens that are empty or consist only of whitespace (multi-character
# punctuation such as '``', "''" or '...' turns into several spaces during the
# clean-up step) carry no information and would otherwise end up among the
# most frequent "words", so they are dropped as well.
words = [
    [w for w in sent if w.strip() and w not in stopword_set]
    for sent in tokenized_sentences
]

In [None]:
# Stemming

In [None]:
# Reduce every remaining token to its Porter stem.
# One stemmer instance is shared by all tokens; the original built a fresh
# PorterStemmer for each token, which is loop-invariant work.
stemmer = PorterStemmer()
for sent in words:
    # Slice assignment keeps the in-place update of each review's token list.
    sent[:] = [stemmer.stem(token) for token in sent]

In [None]:
#Vectorizing

In [None]:
# Flatten the per-review token lists into one corpus-wide list of stems,
# preserving review order and token order within each review.
all_words = [token for sent in words for token in sent]

In [None]:
from collections import Counter

# Corpus-wide frequency of every stem. Counter is a dict subclass, so later
# cells can keep using `words_freq.get` and ordinary dict lookups; keys are
# still inserted in first-occurrence order.
words_freq = Counter(all_words)

In [None]:
import heapq

# Vocabulary for the bag-of-words model: the 1000 stems with the highest
# corpus frequency, most frequent first.
most_freq = heapq.nlargest(
    1000,
    words_freq.keys(),
    key=lambda token: words_freq[token],
)

In [None]:
# Build one binary bag-of-words vector per review: position k is 1 when the
# k-th vocabulary stem (from `most_freq`) occurs in the review, otherwise 0.
sentence_vectors = []
for sentence in words:
    # Set lookup replaces a list scan for each of the vocabulary entries.
    present = set(sentence)
    sentence_vectors.append(
        [1 if token in present else 0 for token in most_freq]
    )
# NOTE: the per-review `print(sent_vec)` debug output was removed; it emitted
# one full-length vector for every review and flooded the notebook output.

In [None]:
# Number of feature vectors built by the previous cell (one per entry in `words`).
len(sentence_vectors)

In [None]:
# Positional hold-out split of the features: the first 30000 vectors are used
# for training, everything after them for testing.
X_train, X_test = sentence_vectors[:30000], sentence_vectors[30000:]

In [None]:
# Split the labels at the same position (30000) as the feature vectors so the
# rows stay aligned.
y_train, y_test = y[:30000], y[30000:]

In [None]:
# KNN

In [None]:
# k-nearest-neighbours classifier that votes over the 9 closest training reviews.
knn = KNeighborsClassifier(n_neighbors=9)

In [None]:
# Fit the KNN model on the training split.
knn.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out reviews.
knn_preds = knn.predict(X=X_test)

In [None]:
# Fraction of held-out reviews the KNN model labelled correctly.
accuracy_score(y_true=y_test, y_pred=knn_preds)

In [None]:
# Logistic regression

In [None]:
# Logistic-regression model with default settings, trained on the same
# features; `fit` returns the estimator itself, so construction and training
# are chained.
lr = LogisticRegression().fit(X_train, y_train)
lr_preds = lr.predict(X_test)

In [None]:
# Held-out accuracy of the logistic-regression model.
accuracy_score(y_true=y_test, y_pred=lr_preds)

In [None]:
# Naive Bayes

In [None]:
# Gaussian naive Bayes with default settings; `fit` returns the estimator, so
# construction and training are chained.
gnb = GaussianNB().fit(X_train, y_train)
gnb_preds = gnb.predict(X_test)

In [None]:
# Held-out accuracy of the naive Bayes model.
accuracy_score(y_true=y_test, y_pred=gnb_preds)

In [None]:
# SVM

In [None]:
# Support-vector classifier with scikit-learn's default settings; `fit`
# returns the estimator, so construction and training are chained.
svc = SVC().fit(X_train, y_train)
svc_preds = svc.predict(X_test)

In [None]:
# Held-out accuracy of the SVM model.
accuracy_score(y_true=y_test, y_pred=svc_preds)

In [None]:
# Decision tree

In [None]:
# Decision tree with default settings; `fit` returns the estimator, so
# construction and training are chained.
dec_tree = DecisionTreeClassifier().fit(X_train, y_train)
dec_preds = dec_tree.predict(X_test)

In [None]:
# Held-out accuracy of the decision-tree model.
accuracy_score(y_true=y_test, y_pred=dec_preds)