In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

### LinearSVC with TF-IDF vectorizer

In [5]:
class AuthorClassifier:
    """Author classifier: TF-IDF features (uni- and bigrams) fed to a LinearSVC.

    ``train`` fits on 75% of the rows of a CSV and keeps the remaining 25%
    as a held-out split on the instance; ``predict`` prints a classification
    report for that held-out split.

    Parameters
    ----------
    random_state : int or None, default=None
        Seed forwarded to ``train_test_split`` and ``LinearSVC``. ``None``
        keeps the unseeded behaviour; pass an int for reproducible results.
    """

    def __init__(self, random_state=None):
        self.random_state = random_state
        self.vectorizer = TfidfVectorizer(strip_accents='ascii', ngram_range=(1,2), norm='l2', sublinear_tf=True)
        self.classifier = LinearSVC(multi_class='ovr', max_iter=1500, random_state=random_state)
        # Raw (un-vectorized) held-out documents and their labels; set by train().
        self.x_test = []
        self.y_test = []

    def train(self, trainpath):
        """Fit the vectorizer and classifier on 75% of the rows in ``trainpath``.

        The CSV is read with its first column as the index. Of the remaining
        columns, the first is used as the text and the last as the label.
        The other 25% of the rows are stored in ``self.x_test`` /
        ``self.y_test`` for ``predict``.

        Raises
        ------
        ValueError
            If the CSV has fewer than two columns besides the index.
        """
        df = pd.read_csv(trainpath, index_col=0)
        if df.shape[1] < 2:
            raise ValueError(
                "expected at least a text column and a label column, got %d column(s)" % df.shape[1]
            )
        # Select columns explicitly instead of reshaping the flattened frame,
        # which would silently misalign rows for any other column count.
        texts = df.iloc[:, 0].to_numpy()
        labels = df.iloc[:, -1].to_numpy()
        x_train, self.x_test, y_train, self.y_test = train_test_split(
            texts, labels, train_size=.75, random_state=self.random_state
        )
        train_features = self.vectorizer.fit_transform(x_train)
        self.classifier.fit(train_features, y_train)

    def predict(self, testpath):
        """Print a classification report for the held-out split kept by ``train``.

        ``testpath`` is accepted for interface compatibility but is not read;
        evaluation always uses ``self.x_test`` / ``self.y_test``.
        TODO(review): confirm whether loading an external test CSV is wanted.
        """
        # Keep the raw documents on the instance untouched so that predict()
        # can be called repeatedly; transforming in place broke the second call.
        test_features = self.vectorizer.transform(self.x_test)
        predicted = self.classifier.predict(test_features)
        print(classification_report(self.y_test, predicted, zero_division=1))

# Fit the LinearSVC pipeline on 75% of the training CSV, then print the
# classification report for the remaining held-out 25%.
# `predict` does not read its path argument, so an empty string is passed.
auth_classifier = AuthorClassifier()
auth_classifier.train(trainpath='./Datasets/Question-5/Train.csv')
auth_classifier.predict(testpath="")

              precision    recall  f1-score   support

         EAP       0.83      0.83      0.83      1602
         HPL       0.85      0.81      0.83      1120
         MWS       0.80      0.84      0.82      1194

    accuracy                           0.83      3916
   macro avg       0.83      0.83      0.83      3916
weighted avg       0.83      0.83      0.83      3916



^ Classification report for Linear SVC

### SGD classifier with TF-IDF vectorizer

In [9]:
class AuthorClassifier2:
    """Author classifier: default TF-IDF features fed to an SGDClassifier.

    ``train`` fits on 75% of the rows of a CSV and keeps the remaining 25%
    as a held-out split on the instance; ``predict`` prints a classification
    report for that held-out split.

    Parameters
    ----------
    alpha : float, default=10
        Regularization strength passed to ``SGDClassifier``. The default is
        kept at 10 for backward compatibility.
        NOTE(review): this is far above scikit-learn's own default (1e-4),
        and the report recorded in this notebook shows only one class being
        predicted; consider a much smaller value.
    random_state : int or None, default=None
        Seed forwarded to ``train_test_split`` and ``SGDClassifier``. ``None``
        keeps the unseeded behaviour; pass an int for reproducible results.
    """

    def __init__(self, alpha=10, random_state=None):
        self.alpha = alpha
        self.random_state = random_state
        self.vectorizer = TfidfVectorizer()
        self.classifier = SGDClassifier(alpha=alpha, random_state=random_state)
        # Raw (un-vectorized) held-out documents and their labels; set by train().
        self.x_test = []
        self.y_test = []

    def train(self, trainpath):
        """Fit the vectorizer and classifier on 75% of the rows in ``trainpath``.

        The CSV is read with its first column as the index. Of the remaining
        columns, the first is used as the text and the last as the label.
        The other 25% of the rows are stored in ``self.x_test`` /
        ``self.y_test`` for ``predict``.

        Raises
        ------
        ValueError
            If the CSV has fewer than two columns besides the index.
        """
        df = pd.read_csv(trainpath, index_col=0)
        if df.shape[1] < 2:
            raise ValueError(
                "expected at least a text column and a label column, got %d column(s)" % df.shape[1]
            )
        # Select columns explicitly instead of reshaping the flattened frame,
        # which would silently misalign rows for any other column count.
        texts = df.iloc[:, 0].to_numpy()
        labels = df.iloc[:, -1].to_numpy()
        x_train, self.x_test, y_train, self.y_test = train_test_split(
            texts, labels, train_size=.75, random_state=self.random_state
        )
        train_features = self.vectorizer.fit_transform(x_train)
        self.classifier.fit(train_features, y_train)

    def predict(self, testpath):
        """Print a classification report for the held-out split kept by ``train``.

        ``testpath`` is accepted for interface compatibility but is not read;
        evaluation always uses ``self.x_test`` / ``self.y_test``.
        TODO(review): confirm whether loading an external test CSV is wanted.
        """
        # Keep the raw documents on the instance untouched so that predict()
        # can be called repeatedly; transforming in place broke the second call.
        test_features = self.vectorizer.transform(self.x_test)
        predicted = self.classifier.predict(test_features)
        print(classification_report(self.y_test, predicted, zero_division=1))

# Fit the SGD pipeline on 75% of the training CSV, then print the
# classification report for the remaining held-out 25%.
# `predict` does not read its path argument, so an empty string is passed.
# Note: this rebinds `auth_classifier`, replacing the instance created earlier.
auth_classifier = AuthorClassifier2()
auth_classifier.train(trainpath='./Datasets/Question-5/Train.csv')
auth_classifier.predict(testpath="")


              precision    recall  f1-score   support

         EAP       0.42      1.00      0.59      1651
         HPL       1.00      0.00      0.00      1067
         MWS       1.00      0.00      0.00      1198

    accuracy                           0.42      3916
   macro avg       0.81      0.33      0.20      3916
weighted avg       0.76      0.42      0.25      3916



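^ Classification report for the SGD classifier. Note that every sample is predicted as EAP (recall 1.00 for EAP, 0.00 for HPL and MWS), so the 0.42 accuracy is simply the share of EAP samples in the held-out split; this is most likely caused by the very strong regularization (`alpha=10`). The 1.00 precision shown for HPL and MWS is an artifact of `zero_division=1`, since neither class is ever predicted.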