In [1]:
import pandas as pd
import numpy as np
import collections
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import nltk
import re
from nltk.corpus import stopwords

In [2]:
data = pd.read_csv('english-train.csv')

In [3]:
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=stopwords.words('english')):

    text = re.sub('[0-9]', ' ', text)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r"\b[a-zA-Z]\b", ' ', text)

    ## Tokenize (convert from string to list)
    lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    text = " ".join(lst_text)
    return text

In [4]:
data['headline'] = data.headline.apply(lambda x: preprocess_text(x))

In [5]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(data['headline'],data['Label'],test_size=0.2,random_state=42)

In [6]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)
print(Train_Y[0:10])
print(Train_X[0:10])

[4 4 4 1 0 0 4 2 3 1]
12346             oneplus lite oneplus name budget variant
13099    behind every galaxy camera click work done u s...
8957                oneplus pro perfect gifting duo diwali
37179    farhan akhtar accepts director taken backseat ...
20310       airline fly empty nowhere keep pilot certified
16675    satellite broadband communication received dot...
13430                        work home guide five webcam r
28569    india committed early settlement boundary issu...
5523     free kick messi show inswinger refined divine ...
33690    satyameva jayate trailer three john abraham ou...
Name: headline, dtype: object


In [7]:
Tfidf_vect = TfidfVectorizer() #6500
Tfidf_vect.fit(data['headline'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [8]:
SVM = svm.SVC(C = 10, gamma = 1, kernel = 'rbf')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  94.8121406605161


In [9]:
mat = confusion_matrix(predictions_SVM, Test_Y)
mat

array([[1472,    8,   36,   18,   54],
       [   5, 1413,   24,   36,   22],
       [  18,    6, 1476,    9,    9],
       [  16,   21,   25, 1404,   12],
       [  37,   17,    9,    6, 1326]], dtype=int64)

In [10]:
report = classification_report(predictions_SVM, Test_Y)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.93      0.94      1588
           1       0.96      0.94      0.95      1500
           2       0.94      0.97      0.96      1518
           3       0.95      0.95      0.95      1478
           4       0.93      0.95      0.94      1395

    accuracy                           0.95      7479
   macro avg       0.95      0.95      0.95      7479
weighted avg       0.95      0.95      0.95      7479



In [11]:
import joblib
import os
loaded_model = joblib.dump(SVM, 'eng_svm.pkl')

In [None]:
SVM = joblib.load('eng_svm.pkl')

In [12]:
simple_test1 = ["T20 World Cup: Team India 'unwinds' day off with beach volleyball"]
Train_X_Tfidf = Tfidf_vect.transform(simple_test1)
pred1 = SVM.predict(Train_X_Tfidf)
pred1[0]
predictions_test = Encoder.inverse_transform(pred1)
predictions_test[0]

'Sports'

In [13]:
simple_test2 = ["What separates Priyanka Gandhi Vadra from Rahul Gandhi in UP assembly election"]
Train_X_Tfidf = Tfidf_vect.transform(simple_test2)
pred2 = SVM.predict(Train_X_Tfidf)
pred2[0]
predictions_test = Encoder.inverse_transform(pred2)
predictions_test[0]

'Politics'

In [14]:
simple_test3 = ["Big hit on India: ‘Higher oil, food prices; duty rollback may help"]
Train_X_Tfidf = Tfidf_vect.transform(simple_test3)
pred3 = SVM.predict(Train_X_Tfidf)
pred3[0]
predictions_test = Encoder.inverse_transform(pred3)
predictions_test[0]

'Business'

In [15]:
simple_test4 = ["Anushka Sharma dresses daughter Vamika as a fairy for Halloween, Soha Ali Khan turns Inaaya into a unicorn"]
Train_X_Tfidf = Tfidf_vect.transform(simple_test4)
pred4 = SVM.predict(Train_X_Tfidf)
pred4[0]
predictions_test = Encoder.inverse_transform(pred4)
predictions_test[0]

'Entertainment'

In [16]:
simple_test5 = ["Apple is expected to launch the iPhone SE 3 later this year."]
Train_X_Tfidf = Tfidf_vect.transform(simple_test5)
pred5 = SVM.predict(Train_X_Tfidf)
pred5[0]
predictions_test = Encoder.inverse_transform(pred5)
predictions_test[0]

'Tech'