In [2]:
#Text Only Analysis 
# https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34
    
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

#load data 
Fakeddit = pd.read_csv("all_train.csv", encoding='latin-1')

#Filter to text only 
Fakeddit = Fakeddit[Fakeddit.image_url.isnull() & Fakeddit.clean_title.notnull()]

#Drop any remaining NAs 
Fakeddit['clean_title'].dropna(inplace=True) 

#Reset index
Fakeddit.reset_index()

#Select rows 
Fakeddit_select = Fakeddit[["clean_title", "2_way_label"]] 

#Sample data 
np.random.seed(100)
Fakeddit_sample = Fakeddit_select.groupby('2_way_label', group_keys=False).apply(lambda x: x.sample(5000))
Fakeddit_sample = Fakeddit_sample.sample(frac = 1) 

#Reset index 
Fakeddit_sample.reset_index(inplace = True)
Fakeddit_sample = Fakeddit_sample[['clean_title', '2_way_label']]

#Tokenization 
Fakeddit_sample['clean_title'] = [word_tokenize(entry) for entry in Fakeddit_sample['clean_title']]

# Lemminization 

tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

for index,entry in enumerate(Fakeddit_sample['clean_title']):
    Final_words = [] 
    word_Lemmatized = WordNetLemmatizer() 
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    Fakeddit_sample.loc[index,'clean_title_final'] = str(Final_words)

Fakeddit_sample 


Unnamed: 0,clean_title,2_way_label,clean_title_final
0,"[loyal, patriot, salutes, the, might, of, red,...",0.0,"['loyal', 'patriot', 'salute', 'might', 'red',..."
1,"[more, than, diners, flee, spanish, restaurant...",1.0,"['diner', 'flee', 'spanish', 'restaurant', 'wi..."
2,"[here, are, turkish, lira, examples, and, must...",1.0,"['turkish', 'lira', 'example', 'mustafa', 'kem..."
3,"[can, someone, zombify, this]",1.0,"['someone', 'zombify']"
4,"[pft, nfl, coaches, who, went, to, work, at, n...",0.0,"['pft', 'nfl', 'coach', 'go', 'work', 'new', '..."
...,...,...,...
9995,"[this, womans, face, on, a, package]",0.0,"['womans', 'face', 'package']"
9996,"[skydiver, sacked, for, having, sex, during, a...",1.0,"['skydiver', 'sack', 'sex', 'tandem', 'jump']"
9997,"[australian, friend, broke, her, arm, falling,...",0.0,"['australian', 'friend', 'break', 'arm', 'fall..."
9998,"[this, corn, hole, craze, has, gotten, outta, ...",1.0,"['corn', 'hole', 'craze', 'get', 'outta', 'hand']"


In [7]:
#Train Test Split 
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    Fakeddit_sample['clean_title_final'], Fakeddit_sample['2_way_label'],test_size = 0.3)

Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

#TF IDF 
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Fakeddit_sample['clean_title_final']) 

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)


In [11]:
#SVM models 

# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)



SVM Accuracy Score ->  69.33333333333334
