In [1]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score,confusion_matrix,recall_score,f1_score,accuracy_score,classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [2]:
# importing csv file
authors_df=pd.read_csv('Authors_dataset.csv')
authors_df['author'].value_counts()

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64

In [3]:
# assigning y to the output variable
y=authors_df['author']
authors_df.drop(columns='author', inplace=True)

In [50]:
# drop the columns with unique values
authors_df.drop(columns='id', inplace=True)

In [51]:
# train test split
x_train,x_test,y_train,y_test=train_test_split(authors_df,y, test_size=0.1, random_state=42)

In [52]:
regexp=RegexpTokenizer(r'\w+')
stopwords_en=stopwords.words('english')
lemmatizer=WordNetLemmatizer()
vectorizer=TfidfVectorizer()

In [53]:
# removing stop words & lemmatizing tokens by preprocessing the data
def preprocessing(text):
    tokens=regexp.tokenize(text)
    pure_tokens=[token.lower() for token in tokens if token.lower() not in stopwords_en]
    lemma_tokens=[lemmatizer.lemmatize(pure_token, pos='v') for pure_token in pure_tokens]
    return ' '.join(lemma_tokens)

In [54]:
# applying the preprocessing to the train and test data 
x_train['text']=x_train['text'].apply(preprocessing)
x_test['text']=x_test['text'].apply(preprocessing)

In [55]:
# TF-IDF vectorization 
x_train_tfidf=vectorizer.fit_transform(x_train['text'])
x_test_tfidf=vectorizer.transform(x_test['text'])

In [10]:
# Logistic Regression
logreg=LogisticRegression()
logreg.fit(x_train_tfidf,y_train)
logreg_pred=logreg.predict(x_test_tfidf)

In [11]:
# Evaluation of Logistic Regression
print('Confusion Matrix', confusion_matrix(y_test,logreg_pred))

Confusion Matrix [[666  57  54]
 [ 86 415  23]
 [108  44 505]]


In [12]:
# Classification Report on Logistic Regression
print(classification_report(y_test,logreg_pred))

              precision    recall  f1-score   support

         EAP       0.77      0.86      0.81       777
         HPL       0.80      0.79      0.80       524
         MWS       0.87      0.77      0.82       657

    accuracy                           0.81      1958
   macro avg       0.82      0.81      0.81      1958
weighted avg       0.81      0.81      0.81      1958



In [61]:
tree=DecisionTreeClassifier()
forest=RandomForestClassifier(random_state=42)
adaboost=AdaBoostClassifier(random_state=42)
gradientbc=GradientBoostingClassifier(random_state=42)
svc=SVC(random_state=42)
multinomial=MultinomialNB()

In [14]:
# Decision Tree Classifier and evaluation
tree.fit(x_train_tfidf,y_train)
tree_pred=tree.predict(x_test_tfidf)
train_score_tree=tree.score(x_train_tfidf,y_train)
test_score_tree=tree.score(x_test_tfidf,y_test)

In [15]:
# Classification Report on DecisionTree Classifier
print(classification_report(y_test,tree_pred))

              precision    recall  f1-score   support

         EAP       0.62      0.62      0.62       777
         HPL       0.51      0.54      0.53       524
         MWS       0.59      0.56      0.57       657

    accuracy                           0.58      1958
   macro avg       0.57      0.57      0.57      1958
weighted avg       0.58      0.58      0.58      1958



In [16]:
# Random Forest Classifier and F1 Score
param_dict={'n_estimators': [100,120,110],
           'max_depth':[4,6,8],
           'min_samples_split':[2,3,4],
           'max_features':['auto','sqrt','log2']}
rscv_forest=RandomizedSearchCV(forest,param_distributions=param_dict,cv=5,verbose=5, n_iter=1000)
rscv_forest.fit(x_train_tfidf,y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] n_estimators=100, min_samples_split=2, max_features=auto, max_depth=4 
[CV]  n_estimators=100, min_samples_split=2, max_features=auto, max_depth=4, score=0.407, total=   1.8s
[CV] n_estimators=100, min_samples_split=2, max_features=auto, max_depth=4 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s


[CV]  n_estimators=100, min_samples_split=2, max_features=auto, max_depth=4, score=0.408, total=   1.6s
[CV] n_estimators=100, min_samples_split=2, max_features=auto, max_depth=4 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.3s remaining:    0.0s


[CV]  n_estimators=100, min_samples_split=2, max_features=auto, max_depth=4, score=0.406, total=   1.6s
[CV] n_estimators=100, min_samples_split=2, max_features=auto, max_depth=4 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.9s remaining:    0.0s


[CV]  n_estimators=100, min_samples_split=2, max_features=auto, max_depth=4, score=0.409, total=   1.6s
[CV] n_estimators=100, min_samples_split=2, max_features=auto, max_depth=4 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.5s remaining:    0.0s


[CV]  n_estimators=100, min_samples_split=2, max_features=auto, max_depth=4, score=0.410, total=   1.6s
[CV] n_estimators=120, min_samples_split=2, max_features=auto, max_depth=4 
[CV]  n_estimators=120, min_samples_split=2, max_features=auto, max_depth=4, score=0.407, total=   2.0s
[CV] n_estimators=120, min_samples_split=2, max_features=auto, max_depth=4 
[CV]  n_estimators=120, min_samples_split=2, max_features=auto, max_depth=4, score=0.407, total=   2.1s
[CV] n_estimators=120, min_samples_split=2, max_features=auto, max_depth=4 
[CV]  n_estimators=120, min_samples_split=2, max_features=auto, max_depth=4, score=0.405, total=   1.9s
[CV] n_estimators=120, min_samples_split=2, max_features=auto, max_depth=4 
[CV]  n_estimators=120, min_samples_split=2, max_features=auto, max_depth=4, score=0.407, total=   1.8s
[CV] n_estimators=120, min_samples_split=2, max_features=auto, max_depth=4 
[CV]  n_estimators=120, min_samples_split=2, max_features=auto, max_depth=4, score=0.409, total=   1

[CV]  n_estimators=120, min_samples_split=2, max_features=sqrt, max_depth=4, score=0.407, total=   1.8s
[CV] n_estimators=120, min_samples_split=2, max_features=sqrt, max_depth=4 
[CV]  n_estimators=120, min_samples_split=2, max_features=sqrt, max_depth=4, score=0.407, total=   1.8s
[CV] n_estimators=120, min_samples_split=2, max_features=sqrt, max_depth=4 
[CV]  n_estimators=120, min_samples_split=2, max_features=sqrt, max_depth=4, score=0.405, total=   1.8s
[CV] n_estimators=120, min_samples_split=2, max_features=sqrt, max_depth=4 
[CV]  n_estimators=120, min_samples_split=2, max_features=sqrt, max_depth=4, score=0.407, total=   1.8s
[CV] n_estimators=120, min_samples_split=2, max_features=sqrt, max_depth=4 
[CV]  n_estimators=120, min_samples_split=2, max_features=sqrt, max_depth=4, score=0.409, total=   1.8s
[CV] n_estimators=110, min_samples_split=2, max_features=sqrt, max_depth=4 
[CV]  n_estimators=110, min_samples_split=2, max_features=sqrt, max_depth=4, score=0.407, total=   1

[CV]  n_estimators=120, min_samples_split=2, max_features=log2, max_depth=4, score=0.404, total=   0.9s
[CV] n_estimators=120, min_samples_split=2, max_features=log2, max_depth=4 
[CV]  n_estimators=120, min_samples_split=2, max_features=log2, max_depth=4, score=0.404, total=   1.0s
[CV] n_estimators=120, min_samples_split=2, max_features=log2, max_depth=4 
[CV]  n_estimators=120, min_samples_split=2, max_features=log2, max_depth=4, score=0.404, total=   1.0s
[CV] n_estimators=120, min_samples_split=2, max_features=log2, max_depth=4 
[CV]  n_estimators=120, min_samples_split=2, max_features=log2, max_depth=4, score=0.404, total=   0.9s
[CV] n_estimators=110, min_samples_split=2, max_features=log2, max_depth=4 
[CV]  n_estimators=110, min_samples_split=2, max_features=log2, max_depth=4, score=0.404, total=   0.9s
[CV] n_estimators=110, min_samples_split=2, max_features=log2, max_depth=4 
[CV]  n_estimators=110, min_samples_split=2, max_features=log2, max_depth=4, score=0.404, total=   0

[CV]  n_estimators=120, min_samples_split=2, max_features=auto, max_depth=6, score=0.417, total=   2.5s
[CV] n_estimators=120, min_samples_split=2, max_features=auto, max_depth=6 
[CV]  n_estimators=120, min_samples_split=2, max_features=auto, max_depth=6, score=0.415, total=   2.5s
[CV] n_estimators=120, min_samples_split=2, max_features=auto, max_depth=6 
[CV]  n_estimators=120, min_samples_split=2, max_features=auto, max_depth=6, score=0.417, total=   2.5s
[CV] n_estimators=110, min_samples_split=2, max_features=auto, max_depth=6 
[CV]  n_estimators=110, min_samples_split=2, max_features=auto, max_depth=6, score=0.416, total=   2.3s
[CV] n_estimators=110, min_samples_split=2, max_features=auto, max_depth=6 
[CV]  n_estimators=110, min_samples_split=2, max_features=auto, max_depth=6, score=0.415, total=   2.3s
[CV] n_estimators=110, min_samples_split=2, max_features=auto, max_depth=6 
[CV]  n_estimators=110, min_samples_split=2, max_features=auto, max_depth=6, score=0.418, total=   2

[CV]  n_estimators=120, min_samples_split=2, max_features=sqrt, max_depth=6, score=0.415, total=   2.5s
[CV] n_estimators=120, min_samples_split=2, max_features=sqrt, max_depth=6 
[CV]  n_estimators=120, min_samples_split=2, max_features=sqrt, max_depth=6, score=0.417, total=   2.5s
[CV] n_estimators=110, min_samples_split=2, max_features=sqrt, max_depth=6 
[CV]  n_estimators=110, min_samples_split=2, max_features=sqrt, max_depth=6, score=0.416, total=   2.3s
[CV] n_estimators=110, min_samples_split=2, max_features=sqrt, max_depth=6 
[CV]  n_estimators=110, min_samples_split=2, max_features=sqrt, max_depth=6, score=0.415, total=   2.3s
[CV] n_estimators=110, min_samples_split=2, max_features=sqrt, max_depth=6 
[CV]  n_estimators=110, min_samples_split=2, max_features=sqrt, max_depth=6, score=0.418, total=   2.3s
[CV] n_estimators=110, min_samples_split=2, max_features=sqrt, max_depth=6 
[CV]  n_estimators=110, min_samples_split=2, max_features=sqrt, max_depth=6, score=0.416, total=   2

[CV]  n_estimators=120, min_samples_split=2, max_features=log2, max_depth=6, score=0.404, total=   1.2s
[CV] n_estimators=110, min_samples_split=2, max_features=log2, max_depth=6 
[CV]  n_estimators=110, min_samples_split=2, max_features=log2, max_depth=6, score=0.404, total=   1.1s
[CV] n_estimators=110, min_samples_split=2, max_features=log2, max_depth=6 
[CV]  n_estimators=110, min_samples_split=2, max_features=log2, max_depth=6, score=0.404, total=   1.1s
[CV] n_estimators=110, min_samples_split=2, max_features=log2, max_depth=6 
[CV]  n_estimators=110, min_samples_split=2, max_features=log2, max_depth=6, score=0.404, total=   1.1s
[CV] n_estimators=110, min_samples_split=2, max_features=log2, max_depth=6 
[CV]  n_estimators=110, min_samples_split=2, max_features=log2, max_depth=6, score=0.404, total=   1.1s
[CV] n_estimators=110, min_samples_split=2, max_features=log2, max_depth=6 
[CV]  n_estimators=110, min_samples_split=2, max_features=log2, max_depth=6, score=0.404, total=   1

[CV]  n_estimators=110, min_samples_split=2, max_features=auto, max_depth=8, score=0.432, total=   3.0s
[CV] n_estimators=110, min_samples_split=2, max_features=auto, max_depth=8 
[CV]  n_estimators=110, min_samples_split=2, max_features=auto, max_depth=8, score=0.430, total=   2.9s
[CV] n_estimators=110, min_samples_split=2, max_features=auto, max_depth=8 
[CV]  n_estimators=110, min_samples_split=2, max_features=auto, max_depth=8, score=0.435, total=   2.9s
[CV] n_estimators=110, min_samples_split=2, max_features=auto, max_depth=8 
[CV]  n_estimators=110, min_samples_split=2, max_features=auto, max_depth=8, score=0.434, total=   2.9s
[CV] n_estimators=110, min_samples_split=2, max_features=auto, max_depth=8 
[CV]  n_estimators=110, min_samples_split=2, max_features=auto, max_depth=8, score=0.431, total=   2.9s
[CV] n_estimators=100, min_samples_split=3, max_features=auto, max_depth=8 
[CV]  n_estimators=100, min_samples_split=3, max_features=auto, max_depth=8, score=0.434, total=   2

[CV]  n_estimators=110, min_samples_split=2, max_features=sqrt, max_depth=8, score=0.430, total=   3.1s
[CV] n_estimators=110, min_samples_split=2, max_features=sqrt, max_depth=8 
[CV]  n_estimators=110, min_samples_split=2, max_features=sqrt, max_depth=8, score=0.435, total=   3.0s
[CV] n_estimators=110, min_samples_split=2, max_features=sqrt, max_depth=8 
[CV]  n_estimators=110, min_samples_split=2, max_features=sqrt, max_depth=8, score=0.434, total=   3.0s
[CV] n_estimators=110, min_samples_split=2, max_features=sqrt, max_depth=8 
[CV]  n_estimators=110, min_samples_split=2, max_features=sqrt, max_depth=8, score=0.431, total=   3.0s
[CV] n_estimators=100, min_samples_split=3, max_features=sqrt, max_depth=8 
[CV]  n_estimators=100, min_samples_split=3, max_features=sqrt, max_depth=8, score=0.434, total=   2.7s
[CV] n_estimators=100, min_samples_split=3, max_features=sqrt, max_depth=8 
[CV]  n_estimators=100, min_samples_split=3, max_features=sqrt, max_depth=8, score=0.432, total=   2

[CV]  n_estimators=110, min_samples_split=2, max_features=log2, max_depth=8, score=0.404, total=   1.3s
[CV] n_estimators=110, min_samples_split=2, max_features=log2, max_depth=8 
[CV]  n_estimators=110, min_samples_split=2, max_features=log2, max_depth=8, score=0.404, total=   1.3s
[CV] n_estimators=110, min_samples_split=2, max_features=log2, max_depth=8 
[CV]  n_estimators=110, min_samples_split=2, max_features=log2, max_depth=8, score=0.404, total=   1.3s
[CV] n_estimators=100, min_samples_split=3, max_features=log2, max_depth=8 
[CV]  n_estimators=100, min_samples_split=3, max_features=log2, max_depth=8, score=0.404, total=   1.2s
[CV] n_estimators=100, min_samples_split=3, max_features=log2, max_depth=8 
[CV]  n_estimators=100, min_samples_split=3, max_features=log2, max_depth=8, score=0.404, total=   1.2s
[CV] n_estimators=100, min_samples_split=3, max_features=log2, max_depth=8 
[CV]  n_estimators=100, min_samples_split=3, max_features=log2, max_depth=8, score=0.404, total=   1

[Parallel(n_jobs=1)]: Done 405 out of 405 | elapsed: 12.9min finished


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
                   n_iter=1000,
                   param_distributions={'max_depth': [4, 6, 8],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_split': [2, 3, 4],
                                        'n_estimators': [100, 120, 110]},
                   verbose=5)

In [25]:
for_pred=rscv_forest.predict(x_test_tfidf)

In [18]:
# Classification Report on RandomForest Classifier
print(classification_report(y_test,for_pred))

              precision    recall  f1-score   support

         EAP       0.41      1.00      0.58       777
         HPL       1.00      0.03      0.05       524
         MWS       0.98      0.08      0.15       657

    accuracy                           0.43      1958
   macro avg       0.80      0.37      0.26      1958
weighted avg       0.76      0.43      0.30      1958



In [19]:
# AdaBoost Classifier and F1 Score
para_dic={'learning_rate':[0.01,0.1,0.2,0.5],
          'n_estimators':[150,120,140]}
rscv_adaboost=RandomizedSearchCV(adaboost,param_distributions=para_dic,cv=5,verbose=5, n_iter=1000)
rscv_adaboost.fit(x_train_tfidf,y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] n_estimators=150, learning_rate=0.01 ............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=150, learning_rate=0.01, score=0.453, total=  52.7s
[CV] n_estimators=150, learning_rate=0.01 ............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   52.6s remaining:    0.0s


[CV]  n_estimators=150, learning_rate=0.01, score=0.485, total=  53.7s
[CV] n_estimators=150, learning_rate=0.01 ............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.8min remaining:    0.0s


[CV]  n_estimators=150, learning_rate=0.01, score=0.469, total=  53.0s
[CV] n_estimators=150, learning_rate=0.01 ............................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.7min remaining:    0.0s


[CV]  n_estimators=150, learning_rate=0.01, score=0.471, total=  53.5s
[CV] n_estimators=150, learning_rate=0.01 ............................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  3.5min remaining:    0.0s


[CV]  n_estimators=150, learning_rate=0.01, score=0.461, total=  53.3s
[CV] n_estimators=120, learning_rate=0.01 ............................
[CV]  n_estimators=120, learning_rate=0.01, score=0.453, total=  42.5s
[CV] n_estimators=120, learning_rate=0.01 ............................
[CV]  n_estimators=120, learning_rate=0.01, score=0.475, total=  42.1s
[CV] n_estimators=120, learning_rate=0.01 ............................
[CV]  n_estimators=120, learning_rate=0.01, score=0.463, total=  42.6s
[CV] n_estimators=120, learning_rate=0.01 ............................
[CV]  n_estimators=120, learning_rate=0.01, score=0.458, total=  43.9s
[CV] n_estimators=120, learning_rate=0.01 ............................
[CV]  n_estimators=120, learning_rate=0.01, score=0.459, total=  42.6s
[CV] n_estimators=140, learning_rate=0.01 ............................
[CV]  n_estimators=140, learning_rate=0.01, score=0.453, total=  48.7s
[CV] n_estimators=140, learning_rate=0.01 ............................
[CV]  

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 48.6min finished


RandomizedSearchCV(cv=5, estimator=AdaBoostClassifier(random_state=42),
                   n_iter=1000,
                   param_distributions={'learning_rate': [0.01, 0.1, 0.2, 0.5],
                                        'n_estimators': [150, 120, 140]},
                   verbose=5)

In [28]:
boost_pred=rscv_adaboost.predict(x_test_tfidf)

In [42]:
# Classification Report on AdaBoost Classifier
print(classification_report(y_test,boost_pred))

              precision    recall  f1-score   support

         EAP       0.55      0.88      0.68       777
         HPL       0.72      0.49      0.58       524
         MWS       0.82      0.45      0.58       657

    accuracy                           0.63      1958
   macro avg       0.70      0.61      0.61      1958
weighted avg       0.69      0.63      0.62      1958



In [20]:
# GradientBoosting Classifier and F1 Score
pa_dict={'n_estimators':[100,120,130],
        'learning_rate':[0.1,0.02,0.5,1],
        'max_features':['auto','sqrt','log2']}
rscv_gradientbc=RandomizedSearchCV(gradientbc, param_distributions=pa_dict, cv=5, verbose=5, n_iter=1000)
rscv_gradientbc.fit(x_train_tfidf,y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] n_estimators=100, max_features=auto, learning_rate=0.1 ..........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=100, max_features=auto, learning_rate=0.1, score=0.629, total= 1.9min
[CV] n_estimators=100, max_features=auto, learning_rate=0.1 ..........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.9min remaining:    0.0s


[CV]  n_estimators=100, max_features=auto, learning_rate=0.1, score=0.629, total= 1.9min
[CV] n_estimators=100, max_features=auto, learning_rate=0.1 ..........


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  3.8min remaining:    0.0s


[CV]  n_estimators=100, max_features=auto, learning_rate=0.1, score=0.618, total= 1.9min
[CV] n_estimators=100, max_features=auto, learning_rate=0.1 ..........


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  5.7min remaining:    0.0s


[CV]  n_estimators=100, max_features=auto, learning_rate=0.1, score=0.629, total= 1.9min
[CV] n_estimators=100, max_features=auto, learning_rate=0.1 ..........


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  7.6min remaining:    0.0s


[CV]  n_estimators=100, max_features=auto, learning_rate=0.1, score=0.636, total= 1.9min
[CV] n_estimators=120, max_features=auto, learning_rate=0.1 ..........
[CV]  n_estimators=120, max_features=auto, learning_rate=0.1, score=0.641, total= 2.3min
[CV] n_estimators=120, max_features=auto, learning_rate=0.1 ..........
[CV]  n_estimators=120, max_features=auto, learning_rate=0.1, score=0.637, total= 2.3min
[CV] n_estimators=120, max_features=auto, learning_rate=0.1 ..........
[CV]  n_estimators=120, max_features=auto, learning_rate=0.1, score=0.626, total= 2.3min
[CV] n_estimators=120, max_features=auto, learning_rate=0.1 ..........
[CV]  n_estimators=120, max_features=auto, learning_rate=0.1, score=0.642, total= 2.3min
[CV] n_estimators=120, max_features=auto, learning_rate=0.1 ..........
[CV]  n_estimators=120, max_features=auto, learning_rate=0.1, score=0.645, total= 2.3min
[CV] n_estimators=130, max_features=auto, learning_rate=0.1 ..........
[CV]  n_estimators=130, max_features=aut

[CV]  n_estimators=130, max_features=auto, learning_rate=0.02, score=0.544, total= 2.4min
[CV] n_estimators=130, max_features=auto, learning_rate=0.02 .........
[CV]  n_estimators=130, max_features=auto, learning_rate=0.02, score=0.533, total= 2.6min
[CV] n_estimators=130, max_features=auto, learning_rate=0.02 .........
[CV]  n_estimators=130, max_features=auto, learning_rate=0.02, score=0.538, total= 2.9min
[CV] n_estimators=130, max_features=auto, learning_rate=0.02 .........
[CV]  n_estimators=130, max_features=auto, learning_rate=0.02, score=0.552, total= 2.8min
[CV] n_estimators=100, max_features=sqrt, learning_rate=0.02 .........
[CV]  n_estimators=100, max_features=sqrt, learning_rate=0.02, score=0.433, total=   7.3s
[CV] n_estimators=100, max_features=sqrt, learning_rate=0.02 .........
[CV]  n_estimators=100, max_features=sqrt, learning_rate=0.02, score=0.430, total=   7.2s
[CV] n_estimators=100, max_features=sqrt, learning_rate=0.02 .........
[CV]  n_estimators=100, max_featur

[CV]  n_estimators=100, max_features=sqrt, learning_rate=0.5, score=0.707, total=   6.9s
[CV] n_estimators=100, max_features=sqrt, learning_rate=0.5 ..........
[CV]  n_estimators=100, max_features=sqrt, learning_rate=0.5, score=0.710, total=   6.9s
[CV] n_estimators=100, max_features=sqrt, learning_rate=0.5 ..........
[CV]  n_estimators=100, max_features=sqrt, learning_rate=0.5, score=0.715, total=   6.9s
[CV] n_estimators=120, max_features=sqrt, learning_rate=0.5 ..........
[CV]  n_estimators=120, max_features=sqrt, learning_rate=0.5, score=0.723, total=   8.5s
[CV] n_estimators=120, max_features=sqrt, learning_rate=0.5 ..........
[CV]  n_estimators=120, max_features=sqrt, learning_rate=0.5, score=0.726, total=   9.0s
[CV] n_estimators=120, max_features=sqrt, learning_rate=0.5 ..........
[CV]  n_estimators=120, max_features=sqrt, learning_rate=0.5, score=0.720, total=   8.7s
[CV] n_estimators=120, max_features=sqrt, learning_rate=0.5 ..........
[CV]  n_estimators=120, max_features=sqr

[CV]  n_estimators=120, max_features=sqrt, learning_rate=1, score=0.720, total=   8.3s
[CV] n_estimators=130, max_features=sqrt, learning_rate=1 ............
[CV]  n_estimators=130, max_features=sqrt, learning_rate=1, score=0.710, total=   9.0s
[CV] n_estimators=130, max_features=sqrt, learning_rate=1 ............
[CV]  n_estimators=130, max_features=sqrt, learning_rate=1, score=0.718, total=   9.0s
[CV] n_estimators=130, max_features=sqrt, learning_rate=1 ............
[CV]  n_estimators=130, max_features=sqrt, learning_rate=1, score=0.717, total=   9.1s
[CV] n_estimators=130, max_features=sqrt, learning_rate=1 ............
[CV]  n_estimators=130, max_features=sqrt, learning_rate=1, score=0.713, total=   9.0s
[CV] n_estimators=130, max_features=sqrt, learning_rate=1 ............
[CV]  n_estimators=130, max_features=sqrt, learning_rate=1, score=0.724, total=   9.0s
[CV] n_estimators=100, max_features=log2, learning_rate=1 ............
[CV]  n_estimators=100, max_features=log2, learning_

[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed: 174.7min finished


RandomizedSearchCV(cv=5, estimator=GradientBoostingClassifier(random_state=42),
                   n_iter=1000,
                   param_distributions={'learning_rate': [0.1, 0.02, 0.5, 1],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'n_estimators': [100, 120, 130]},
                   verbose=5)

In [33]:
grad_pred=rscv_gradientbc.predict(x_test_tfidf)

In [43]:
# Classification Report on GradientBoosting Classifier
print(classification_report(y_test,grad_pred))

              precision    recall  f1-score   support

         EAP       0.65      0.87      0.74       777
         HPL       0.78      0.65      0.71       524
         MWS       0.82      0.59      0.69       657

    accuracy                           0.72      1958
   macro avg       0.75      0.70      0.71      1958
weighted avg       0.74      0.72      0.71      1958



In [21]:
# Support Vector Machine(SVC) and F1 Score
p_dict={'C':[0.001,0.01,0.1,1,10],
     'gamma':['scale','auto'],
     'kernel':['linear','rbf','sigmoid']}

rscv_svm=RandomizedSearchCV(svc, param_distributions=p_dict, cv=5,verbose=3, n_iter=500)
rscv_svm.fit(x_train_tfidf,y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] kernel=linear, gamma=scale, C=0.001 .............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] . kernel=linear, gamma=scale, C=0.001, score=0.404, total= 1.0min
[CV] kernel=linear, gamma=scale, C=0.001 .............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.0min remaining:    0.0s


[CV] . kernel=linear, gamma=scale, C=0.001, score=0.404, total= 1.0min
[CV] kernel=linear, gamma=scale, C=0.001 .............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.0min remaining:    0.0s


[CV] . kernel=linear, gamma=scale, C=0.001, score=0.404, total= 1.0min
[CV] kernel=linear, gamma=scale, C=0.001 .............................
[CV] . kernel=linear, gamma=scale, C=0.001, score=0.404, total= 1.0min
[CV] kernel=linear, gamma=scale, C=0.001 .............................
[CV] . kernel=linear, gamma=scale, C=0.001, score=0.404, total= 1.0min
[CV] kernel=rbf, gamma=scale, C=0.001 ................................
[CV] .... kernel=rbf, gamma=scale, C=0.001, score=0.404, total= 1.1min
[CV] kernel=rbf, gamma=scale, C=0.001 ................................
[CV] .... kernel=rbf, gamma=scale, C=0.001, score=0.404, total= 1.1min
[CV] kernel=rbf, gamma=scale, C=0.001 ................................
[CV] .... kernel=rbf, gamma=scale, C=0.001, score=0.404, total= 1.1min
[CV] kernel=rbf, gamma=scale, C=0.001 ................................
[CV] .... kernel=rbf, gamma=scale, C=0.001, score=0.404, total= 1.1min
[CV] kernel=rbf, gamma=scale, C=0.001 ................................
[CV] .

[CV] ... kernel=linear, gamma=scale, C=0.1, score=0.669, total= 1.0min
[CV] kernel=linear, gamma=scale, C=0.1 ...............................
[CV] ... kernel=linear, gamma=scale, C=0.1, score=0.676, total=  59.7s
[CV] kernel=linear, gamma=scale, C=0.1 ...............................
[CV] ... kernel=linear, gamma=scale, C=0.1, score=0.669, total=  58.9s
[CV] kernel=linear, gamma=scale, C=0.1 ...............................
[CV] ... kernel=linear, gamma=scale, C=0.1, score=0.664, total= 1.0min
[CV] kernel=linear, gamma=scale, C=0.1 ...............................
[CV] ... kernel=linear, gamma=scale, C=0.1, score=0.667, total=  59.4s
[CV] kernel=rbf, gamma=scale, C=0.1 ..................................
[CV] ...... kernel=rbf, gamma=scale, C=0.1, score=0.410, total= 1.3min
[CV] kernel=rbf, gamma=scale, C=0.1 ..................................
[CV] ...... kernel=rbf, gamma=scale, C=0.1, score=0.408, total= 1.3min
[CV] kernel=rbf, gamma=scale, C=0.1 ..................................
[CV] .

[CV] ..... kernel=sigmoid, gamma=auto, C=1, score=0.404, total= 1.1min
[CV] kernel=sigmoid, gamma=auto, C=1 .................................
[CV] ..... kernel=sigmoid, gamma=auto, C=1, score=0.404, total= 1.1min
[CV] kernel=linear, gamma=scale, C=10 ................................
[CV] .... kernel=linear, gamma=scale, C=10, score=0.763, total=  40.3s
[CV] kernel=linear, gamma=scale, C=10 ................................
[CV] .... kernel=linear, gamma=scale, C=10, score=0.768, total=  41.5s
[CV] kernel=linear, gamma=scale, C=10 ................................
[CV] .... kernel=linear, gamma=scale, C=10, score=0.765, total=  42.9s
[CV] kernel=linear, gamma=scale, C=10 ................................
[CV] .... kernel=linear, gamma=scale, C=10, score=0.765, total=  40.9s
[CV] kernel=linear, gamma=scale, C=10 ................................
[CV] .... kernel=linear, gamma=scale, C=10, score=0.764, total=  43.7s
[CV] kernel=rbf, gamma=scale, C=10 ...................................
[CV] .

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 165.9min finished


RandomizedSearchCV(cv=5, estimator=SVC(random_state=42), n_iter=500,
                   param_distributions={'C': [0.001, 0.01, 0.1, 1, 10],
                                        'gamma': ['scale', 'auto'],
                                        'kernel': ['linear', 'rbf', 'sigmoid']},
                   verbose=3)

In [30]:
svc_pred=rscv_svm.predict(x_test_tfidf)

In [44]:
# Classification Report on Support Vector Machine(SVC)
print(classification_report(y_test,svc_pred))

              precision    recall  f1-score   support

         EAP       0.78      0.86      0.81       777
         HPL       0.82      0.80      0.81       524
         MWS       0.87      0.78      0.82       657

    accuracy                           0.82      1958
   macro avg       0.82      0.81      0.81      1958
weighted avg       0.82      0.82      0.82      1958



In [62]:
# Multinomial Naive Bayes
multinomial.fit(x_train_tfidf,y_train)
multi_pred=multinomial.predict(x_test_tfidf)

In [63]:
# Classification Report on Multinomial Naive Bayes
print(classification_report(y_test,multi_pred))

              precision    recall  f1-score   support

         EAP       0.76      0.87      0.81       777
         HPL       0.87      0.73      0.79       524
         MWS       0.85      0.81      0.83       657

    accuracy                           0.81      1958
   macro avg       0.82      0.80      0.81      1958
weighted avg       0.82      0.81      0.81      1958

