In [1]:
import pandas as pd
import spacy
import numpy as np
import en_core_web_sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder, FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn import svm, model_selection
from sklearn.compose import ColumnTransformer
from spacy.tokens import Token
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier
from src.funct import read_data as rd
from pathlib import Path

In [2]:
# Name of the spaCy pipeline package used throughout the notebook.
SPACY_MODEL_NAME = "en_core_web_sm"

# Load the pipeline once; later cells extend it and pass it to the data loader.
nlp = spacy.load(SPACY_MODEL_NAME)

In [3]:
# Register a custom per-token attribute, reachable as `token._.tfidf`.
# It holds the placeholder 'empty' until the custom pipeline component
# overwrites it. force=True overwrites any existing registration, so this
# cell can be re-run on a live kernel.
Token.set_extension(
    'tfidf',
    default='empty',
    force=True,
)

from spacy.language import Language
@Language.component("my_component")
def my_component(doc):
    """Store a composite feature string in every token's ``tfidf`` extension.

    For each token the value is ``lemma~shape~pos`` followed by ``~next_pos``,
    where ``next_pos`` is the part-of-speech tag of the following token. The
    last token of the document has no successor, so it gets only the first
    three fields.

    Args:
        doc: The document being processed; it is indexed, iterated and its
            tokens are updated in place through ``token._.set``.

    Returns:
        The same ``doc`` object, as required for a pipeline component.
    """
    final_position = len(doc) - 1

    for position, token in enumerate(doc):
        fields = [token.lemma_, token.shape_, token.pos_]
        # Every token except the last one also records its successor's POS tag.
        if position < final_position:
            fields.append(doc[position + 1].pos_)
        token._.set('tfidf', '~'.join(fields))

    return doc

In [4]:
# Append the custom component at the end of the pipeline under the name
# "print_length". The guard keeps this cell idempotent: adding a second
# component with an already-used name raises a ValueError, so re-running the
# cell on a live kernel would otherwise fail.
if "print_length" not in nlp.component_names:
    nlp.add_pipe('my_component', name="print_length", last=True)

<function __main__.my_component(doc)>

In [5]:
# Absolute form of '../data/raw/bbc-text.csv', resolved against the current
# working directory (the notebook is expected to run one level below the
# project root).
file_path = Path.cwd().parent.joinpath('data', 'raw', 'bbc-text.csv')

# rd.read_data is a project helper (src.funct) that receives the CSV path and
# the spaCy pipeline. NOTE(review): presumably returns a DataFrame with 'text'
# and 'category' columns, since later cells index it that way - confirm.
train_data = rd.read_data(file_path, nlp)

In [6]:
# 25:75 split of the labelled data into test and train partitions.
# A fixed random_state makes the shuffle reproducible, so the same documents
# land in each partition on every Restart Kernel -> Run All (42 matches the
# seed already used for the random forest).
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    train_data['text'],
    train_data['category'],
    test_size=0.25,
    random_state=42,
)

In [7]:
# Encode the class labels as integers.
# The encoder learns its classes from the full 'category' column, so the
# train and test partitions share one label-to-integer mapping.
Encoder = LabelEncoder().fit(train_data['category'].astype(str))

Train_Y_LE, Test_Y_LE = (
    Encoder.transform(labels.astype(str)) for labels in (Train_Y, Test_Y)
)

In [8]:
# TF-IDF vectorisation of the text features passed to the classifiers.
# The vocabulary and IDF weights are learned from the training partition
# only: fitting on the complete data set would let statistics of the held-out
# documents leak into the features and bias the test metrics.
Tfidf_vect = TfidfVectorizer()

Train_X_Tfidf = Tfidf_vect.fit_transform(Train_X.astype(str))
# The held-out partition is only transformed with the already-fitted vectorizer.
Test_X_Tfidf = Tfidf_vect.transform(Test_X.astype(str))

In [9]:
# Candidate classifiers compared in the cells below.

# Support vector machine with a linear kernel.
# NOTE: per the scikit-learn docs, `degree` and `gamma` only affect the
# non-linear kernels; they are kept exactly as originally configured.
SVM = svm.SVC(
    C=1.0,
    kernel='linear',
    degree=3,
    gamma='auto',
)

# k-nearest neighbours: 2 neighbours, votes weighted by inverse distance.
knn = KNeighborsClassifier(
    n_neighbors=2,
    weights='distance',
)

# Random forest: 10 trees, entropy split criterion, fixed seed.
RF = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=42,
)

In [10]:
def classifier(model, trained_X, trained_y, testing_X):
    """Fit ``model`` on the training data and predict the test samples.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place, so the caller's object is trained afterwards.
        trained_X: Feature matrix used for fitting.
        trained_y: Target labels aligned with ``trained_X``.
        testing_X: Feature matrix to predict.

    Returns:
        Whatever ``model.predict(testing_X)`` returns.
    """
    model.fit(trained_X, trained_y)
    return model.predict(testing_X)

In [11]:
def evaluate(model, predicted, truth):
    """Print macro precision, macro recall and accuracy for a set of predictions.

    Args:
        model: Label printed in front of every metric (the callers in this
            notebook pass a short string such as 'SVM').
        predicted: Predicted labels.
        truth: Ground-truth labels aligned with ``predicted``.

    Returns:
        None; the three metrics are only printed, in the order
        precision, recall, accuracy.
    """
    # Each entry pairs the printed label with a deferred metric computation so
    # that every score is computed immediately before it is printed.
    metric_table = (
        (" precision: ", lambda: precision_score(truth, predicted, average="macro")),
        (" recall: ", lambda: recall_score(truth, predicted, average="macro")),
        (" accuracy: ", lambda: accuracy_score(truth, predicted)),
    )
    for label, compute in metric_table:
        print(model, label, compute())

In [None]:
# 10-fold cross-validation of the SVM on the training partition.
# The vectorizer is wrapped in a Pipeline together with the classifier so that
# TF-IDF is re-fitted on the training folds of every split. Scoring
# pre-computed TF-IDF features would expose each validation fold's
# vocabulary/IDF statistics to the model and inflate the score.
# cross_val_score clones the estimator, so the global `SVM` is not fitted here.
svm_cv_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVM),
])
scores = cross_val_score(
    svm_cv_pipeline,
    Train_X.astype(str),
    Train_Y_LE,
    cv=10,
    scoring='f1_macro',
)
print("SVM Average f1 score", np.mean(scores))

In [None]:
# Fit the SVM on the training features, predict the held-out set, and print
# its precision / recall / accuracy.
predictions_SVM1 = classifier(
    model=SVM,
    trained_X=Train_X_Tfidf,
    trained_y=Train_Y_LE,
    testing_X=Test_X_Tfidf,
)
evaluate(model='SVM', predicted=predictions_SVM1, truth=Test_Y_LE)

In [None]:
# Fit the k-nearest-neighbours model on the training features, predict the
# held-out set, and print its precision / recall / accuracy.
predictions_knn = classifier(
    model=knn,
    trained_X=Train_X_Tfidf,
    trained_y=Train_Y_LE,
    testing_X=Test_X_Tfidf,
)
evaluate(model='KNN', predicted=predictions_knn, truth=Test_Y_LE)

In [None]:
# Fit the random forest on the training features, predict the held-out set,
# and print its precision / recall / accuracy.
rf_predictions = classifier(
    model=RF,
    trained_X=Train_X_Tfidf,
    trained_y=Train_Y_LE,
    testing_X=Test_X_Tfidf,
)
evaluate(model='RF', predicted=rf_predictions, truth=Test_Y_LE)