In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import svm
import joblib
import pickle

In [2]:
def multinomialNaiveBayes(X_train, X_test, y_train, y_test):
    nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])

    tempNB = nb.fit(X_train, y_train)
    yPred = nb.predict(X_test)
    return yPred

In [3]:
def logisticRegression(X_train, X_test, y_train, y_test):
    logreg = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', LogisticRegression(n_jobs=1, C=1e9)),
                   ])
    tempLR = logreg.fit(X_train, y_train)
    yPred = logreg.predict(X_test)
    return yPred

In [4]:
def randomForest(X_train, X_test, y_train, y_test):
    ranfor = Pipeline([('vect', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),
                      ])
    tempRM = ranfor.fit(X_train, y_train)
    yPred = ranfor.predict(X_test)
    return yPred

In [5]:
def multiLayerPerceptron(X_train, X_test, y_train, y_test):
    mlp= Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MLPClassifier(hidden_layer_sizes=(30,30,30))),
                     ])
    tempMLP = mlp.fit(X_train, y_train)
    yPred = mlp.predict(X_test)
    return yPred

In [6]:
def mySVM(X_train, X_test, y_train, y_test):
    svmNew = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', svm.SVC(kernel='linear', C=1.0)),
                   ])
    SVM = svmNew.fit(X_train, y_train)
    yPred = svmNew.predict(X_test)
    return yPred

## Fetching and Splitting Cleaned Data into Test and Train Sets

In [7]:
flairs = flairs= ["Scheduled","Politics","Photography","Policy/Economy","AskIndia","Sports",
         "Non-Political","Science/Technology","Food","Business/Finance","Coronavirus"]
data = pd.read_csv('../data/cleansedData300.csv')
data["combined"] = data.title.astype("str")+" "+ data.url.astype("str")+" "+ data.author.astype("str")+" "+data.authors.astype("str")+" "+ data.comments.astype("str")
y = data.flair
X = data.combined
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.1,random_state = 5)

## Performing Machine Learning Algorithms

In [8]:
accNB = multinomialNaiveBayes(X_train, X_test, y_train, y_test)
print("Multi-Nomial Naive Bayes Accuracy: ",accuracy_score(accNB, y_test))
print()
print(classification_report(y_test, accNB,target_names=flairs))

Multi-Nomial Naive Bayes Accuracy:  0.6666666666666666

                    precision    recall  f1-score   support

         Scheduled       0.50      0.68      0.58        28
          Politics       0.70      0.48      0.57        29
       Photography       0.65      0.81      0.72        32
    Policy/Economy       0.67      0.63      0.65        19
          AskIndia       0.33      0.21      0.26        24
            Sports       0.84      0.93      0.88        28
     Non-Political       0.42      0.59      0.49        17
Science/Technology       0.62      0.67      0.64        24
              Food       0.96      1.00      0.98        27
  Business/Finance       0.79      0.60      0.68        25
       Coronavirus       1.00      0.57      0.73        14

          accuracy                           0.67       267
         macro avg       0.68      0.65      0.65       267
      weighted avg       0.68      0.67      0.66       267



In [9]:
accLR = logisticRegression(X_train, X_test, y_train, y_test)
print("Logistic Regression Accuracy: ",accuracy_score(accLR, y_test))
print()
print(classification_report(y_test, accLR,target_names=flairs))

Logistic Regression Accuracy:  0.6367041198501873

                    precision    recall  f1-score   support

         Scheduled       0.44      0.54      0.48        28
          Politics       0.68      0.52      0.59        29
       Photography       0.69      0.69      0.69        32
    Policy/Economy       0.52      0.63      0.57        19
          AskIndia       0.41      0.38      0.39        24
            Sports       0.82      0.82      0.82        28
     Non-Political       0.42      0.59      0.49        17
Science/Technology       0.67      0.58      0.62        24
              Food       0.90      1.00      0.95        27
  Business/Finance       0.68      0.60      0.64        25
       Coronavirus       0.89      0.57      0.70        14

          accuracy                           0.64       267
         macro avg       0.65      0.63      0.63       267
      weighted avg       0.65      0.64      0.64       267



In [10]:
accRF = randomForest(X_train, X_test, y_train, y_test)
print("Random Forest Accuracy: ",accuracy_score(accRF, y_test))
print()
print(classification_report(y_test, accRF,target_names=flairs))

Random Forest Accuracy:  0.5655430711610487

                    precision    recall  f1-score   support

         Scheduled       0.49      0.68      0.57        28
          Politics       0.44      0.41      0.43        29
       Photography       0.51      0.81      0.63        32
    Policy/Economy       0.26      0.26      0.26        19
          AskIndia       1.00      0.04      0.08        24
            Sports       0.54      0.96      0.69        28
     Non-Political       0.62      0.29      0.40        17
Science/Technology       0.79      0.46      0.58        24
              Food       1.00      1.00      1.00        27
  Business/Finance       0.48      0.48      0.48        25
       Coronavirus       1.00      0.43      0.60        14

          accuracy                           0.57       267
         macro avg       0.65      0.53      0.52       267
      weighted avg       0.63      0.57      0.53       267



In [11]:
accMLP = multiLayerPerceptron(X_train, X_test, y_train, y_test)
print("Multi Layer Perceptron Accuracy: ",accuracy_score(accMLP, y_test))
print()
print(classification_report(y_test, accMLP,target_names=flairs))

Multi Layer Perceptron Accuracy:  0.5205992509363296

                    precision    recall  f1-score   support

         Scheduled       0.38      0.21      0.27        28
          Politics       0.64      0.24      0.35        29
       Photography       0.68      0.53      0.60        32
    Policy/Economy       0.46      0.68      0.55        19
          AskIndia       0.26      0.29      0.27        24
            Sports       0.81      0.61      0.69        28
     Non-Political       0.26      0.71      0.37        17
Science/Technology       0.61      0.46      0.52        24
              Food       1.00      1.00      1.00        27
  Business/Finance       0.35      0.52      0.42        25
       Coronavirus       0.90      0.64      0.75        14

          accuracy                           0.52       267
         macro avg       0.58      0.54      0.53       267
      weighted avg       0.58      0.52      0.53       267



In [12]:
accSVM = mySVM(X_train, X_test, y_train, y_test)
print("SVM Accuracy: ",accuracy_score(accSVM, y_test))
print()
print(classification_report(y_test, accMLP,target_names=flairs))

SVM Accuracy:  0.6367041198501873

                    precision    recall  f1-score   support

         Scheduled       0.38      0.21      0.27        28
          Politics       0.64      0.24      0.35        29
       Photography       0.68      0.53      0.60        32
    Policy/Economy       0.46      0.68      0.55        19
          AskIndia       0.26      0.29      0.27        24
            Sports       0.81      0.61      0.69        28
     Non-Political       0.26      0.71      0.37        17
Science/Technology       0.61      0.46      0.52        24
              Food       1.00      1.00      1.00        27
  Business/Finance       0.35      0.52      0.42        25
       Coronavirus       0.90      0.64      0.75        14

          accuracy                           0.52       267
         macro avg       0.58      0.54      0.53       267
      weighted avg       0.58      0.52      0.53       267



## Saving Naive Bayes Model since it was the best of all

In [13]:
nb = Pipeline([('vect', CountVectorizer()),
           ('tfidf', TfidfTransformer()),
           ('clf', MultinomialNB()),
          ])

tempNB = nb.fit(X_train, y_train)
pickle.dump(tempNB,open("../data/finalModel.sav","wb"))