In [1]:
#data Visualization 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Load Text Cleaning Pkgs and Transformers
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer


# Load ML Pkgs
# Estimators
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [2]:
df = pd.read_csv("BBC News Train.csv")


In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vaishnavi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vaishnavi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Vaishnavi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Vaishnavi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Text preprocess and Tokenizatin

In [30]:
def lemmatize(text):
    text = text.split()
    wordnet = WordNetLemmatizer()
    text = [wordnet.lemmatize(word) for word in text if word not in set(stopwords.words('english'))]
    return " ".join(text)

In [28]:
def remove_special_characters(text):
    text = re.sub("<br\\s*/?>", " ", text)
    text = re.sub("[^a-zA-Z']", " ", text)
    text = re.sub("-", " ", text)
    
    return text

In [29]:
def preprocess(data):
    combo = []
    for i in range(0,len(data)):
        text = remove_special_characters(data['Text'][i])
        text = text.lower()
        text = lemmatize(text)
        
        combo.append(text)
    data['Text_processed'] = pd.DataFrame(combo)
    return data

In [31]:
preprocess(df)

Unnamed: 0,ArticleId,Text,Category,Text_processed
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom ex bos launch defence lawyer defendin...
1,154,german business confidence slides german busin...,business,german business confidence slide german busine...
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizen majo...
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster better ...
4,917,enron bosses in $168m payout eighteen former e...,business,enron boss payout eighteen former enron direct...
...,...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment,double eviction big brother model caprice holb...
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment,dj double act revamp chart show dj duo jk joel...
1487,1590,weak dollar hits reuters revenues at media gro...,business,weak dollar hit reuters revenue medium group r...
1488,1587,apple ipod family expands market apple has exp...,tech,apple ipod family expands market apple expande...


In [110]:
df.to_csv('data_processed.csv', index=False )

# label the column 'Category' for learning 

In [98]:

# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
  
# Encode labels in column 'Category'.
df['Category_id']= label_encoder.fit_transform(df['Category'])
  

In [99]:
X_train,X_test,y_train,y_test = train_test_split(df['Text_processed'],
                                                 df['Category_id'],
                                                test_size=0.20,
                                                random_state=42)

In [100]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1192,)
(298,)
(1192,)
(298,)


# tfidf Vectorizer for feature Extraction 

In [101]:

tfidf = TfidfVectorizer(max_features=1000)

In [102]:
# transforming train and text data seprately to avoid overfitting

In [103]:
X_trn = tfidf.fit_transform(X_train).toarray()

In [104]:
X_trn.shape

(1192, 1000)

In [105]:
X_tst= tfidf.fit_transform(X_test).toarray()

In [106]:
X_tst.shape

(298, 1000)

# Different ML algorithms to train dataset and test accuracy 

In [107]:

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression",
         "Naive Bayes", "SVM "]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    MultinomialNB(),
    SVC()
]

models = zip(names, classifiers)

In [108]:
for name, model in models:
    nlp_model=model
    nlp_model.fit(X_trn,y_train)
    y_pred = nlp_model.predict(X_tst)
    test_accuracy  = accuracy_score(y_test, y_pred)
    print('{} Accuracy: {} '.format(name, test_accuracy))
    

K Nearest Neighbors Accuracy: 0.24161073825503357 
Decision Tree Accuracy: 0.19463087248322147 
Random Forest Accuracy: 0.2348993288590604 
Logistic Regression Accuracy: 0.28523489932885904 
Naive Bayes Accuracy: 0.28859060402684567 
SVM  Accuracy: 0.2684563758389262 


In [None]:
#the test data accuracy is too low -->> perform hyperparameter tuning on Logistic Regression , Naive Bayes ,SVM 

In [55]:
# create a pipeline choose best tfidf parameter with best ml classifier
X = df['Text'][:1192]

# svc pipeline model 1

In [56]:
# hyperparameter tuning of svm model  
pipeline_svm = Pipeline([
    ('vect', TfidfVectorizer( max_features=1000, stop_words='english' )),
    ('svm', SVC())
])

In [57]:
parameters_svm = {
    
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'svm__C': [0.1, 1, 10, 100, 1000],
    'svm__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'svm__kernel': ['rbf','linear']    
}

In [58]:

grid = GridSearchCV(pipeline_svm, parameters_svm, cv=2, n_jobs=2, verbose=3 ,scoring='accuracy')
grid.fit(X, y_train)

Fitting 2 folds for each of 150 candidates, totalling 300 fits


GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(lowercase=False,
                                                        max_features=1000,
                                                        stop_words='english')),
                                       ('svm', SVC())]),
             n_jobs=2,
             param_grid={'svm__C': [0.1, 1, 10, 100, 1000],
                         'svm__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'svm__kernel': ['rbf', 'linear'],
                         'vect__ngram_range': [(1, 1), (1, 2), (1, 3)]},
             scoring='accuracy', verbose=3)

In [65]:
print("Best parameters set:")
print(grid.best_estimator_)



Best parameters set:
Pipeline(steps=[('vect',
                 TfidfVectorizer(lowercase=False, max_features=1000,
                                 stop_words='english')),
                ('svm', SVC(C=1, gamma=1))])


In [67]:
y_pred = grid.predict(X_test)
print("SVM Test Accuracy: {}".format(accuracy_score(y_test,y_pred)))

SVM Test Accuracy: 0.12751677852348994


# LogisticRegression pipeline model 2

In [70]:
pipe_lr = Pipeline([
    ('vect', TfidfVectorizer(max_features=1000)),
    ('classifier', LogisticRegression())
])

In [71]:
##LogisticRegression solvers don't allow L1 penalty
param_lr = {
             "classifier__penalty": ['l2'],
             "classifier__C": np.logspace(0, 4, 10),
             "classifier__solver":['newton-cg','saga','sag','liblinear'], 
             "vect__max_df": (0.25, 0.5, 0.75),
             "vect__ngram_range": [(1, 1), (1, 2), (1, 3)]
             }

In [73]:
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe_lr, param_lr , cv=2, verbose=0,n_jobs=-1,scoring='accuracy') # Fit grid search
gridsearch.fit(X_train,y_train)

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(max_features=1000)),
                                       ('classifier', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'classifier__C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]),
                         'classifier__penalty': ['l2'],
                         'classifier__solver': ['newton-cg', 'saga', 'sag',
                                                'liblinear'],
                         'vect__max_df': (0.25, 0.5, 0.75),
                         'vect__ngram_range': [(1, 1), (1, 2), (1, 3)]},
             scoring='accuracy')

In [82]:

print(gridsearch.best_estimator_)
print("Best parameters set:")
print(gridsearch.best_params_)
print("Best Score:")
print(gridsearch.best_score_)


Pipeline(steps=[('vect',
                 TfidfVectorizer(max_df=0.5, max_features=1000,
                                 ngram_range=(1, 2))),
                ('classifier',
                 LogisticRegression(C=7.742636826811269, solver='newton-cg'))])
Best parameters set:
{'classifier__C': 7.742636826811269, 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg', 'vect__max_df': 0.5, 'vect__ngram_range': (1, 2)}
Best Score:
0.9672818791946309


In [77]:
y_pred = gridsearch.predict(X_test)
print("LogisticRegression Test Accuracy: {}".format(accuracy_score(y_test,y_pred)))

LogisticRegression Test Accuracy: 0.9765100671140939


# Naive Bayes pipeline model 3

In [84]:
pipe_mnb = Pipeline([
    ('vect', TfidfVectorizer(max_features=1000)),
    ('mnb', MultinomialNB())
])

In [86]:
param_mnb = {
            'mnb__alpha': np.linspace(0.5, 1.5, 6),
            'mnb__fit_prior': [True, False],
            'vect__norm': [None, 'l1', 'l2'],
             'vect__max_df': (0.25, 0.5, 0.75),
             'vect__ngram_range': [(1, 1), (1, 2), (1, 3)]
             }

In [88]:
# create a gridsearch of the pipeline, the fit the best model
gridmnb = GridSearchCV(pipe_mnb, param_mnb , cv=2, verbose=0,n_jobs=-1,scoring='accuracy') # Fit grid search
gridmnb.fit(X_train,y_train)

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(max_features=1000)),
                                       ('mnb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'mnb__alpha': array([0.5, 0.7, 0.9, 1.1, 1.3, 1.5]),
                         'mnb__fit_prior': [True, False],
                         'vect__max_df': (0.25, 0.5, 0.75),
                         'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
                         'vect__norm': [None, 'l1', 'l2']},
             scoring='accuracy')

In [89]:

print("Best parameters set:")
print(gridmnb.best_params_)
print("Best Score:")
print(gridmnb.best_score_)

Best parameters set:
{'mnb__alpha': 0.5, 'mnb__fit_prior': True, 'vect__max_df': 0.75, 'vect__ngram_range': (1, 3), 'vect__norm': 'l2'}
Best Score:
0.9614093959731544


In [91]:
y_predmnb = gridmnb.predict(X_test)
print("Naive Bayes Test Accuracy: {}".format(accuracy_score(y_test,y_predmnb)))

Naive Bayes Test Accuracy: 0.9664429530201343
