In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [2]:
np.random.seed(500)

In [3]:
file1 = "./data/spam.csv"
file2 = "./data/imdb.csv"
file3 = "./data/yelp.csv"
df = pd.read_csv(file1, delimiter=',')

In [4]:
df = df.sample(frac=1)


In [6]:
df['content'] = [word_tokenize(entry) for entry in df['content']]

In [7]:
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [8]:
for index,entry in enumerate(df['content']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    df.loc[index,'text_final'] = str(Final_words)

In [53]:
df['text_final']

27668    ['I', 'think', 'movie', 'hysterical', 'I', 'wa...
28404    ['Another', 'episode', 'childhood', 'adult', '...
22285    ['The', 'Man', 'Who', 'Knew', 'Too', 'Much', '...
5046     ['So', 'come', 'In', 'comment', 'flick', 'some...
28648    ['Larry', 'perfect', 'example', 'Democratic', ...
                               ...                        
41155    ['I', 'start', 'admit', 'I', 'enjoy', 'many', ...
11199    ['I', 'get', 'DVD', 'well', 'year', 'ago', 'de...
22003    ['Without', 'doubt', 'one', 'bad', 'film', 'I'...
14901    ['great', 'movie', 'like', 'ning', 'climb', 'g...
48555    ['This', 'movie', 'flaw', 'many', 'front', 'Li...
Name: text_final, Length: 50000, dtype: object

In [9]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df['text_final'],df['category'],test_size=0.2)

In [10]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [11]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)

SVC(gamma='auto', kernel='linear')

In [12]:
predictions_SVM = SVM.predict(Test_X_Tfidf)
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  88.25112107623319
