#### Importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np

from preprocess import preprocess_text

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jaskaransingh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jaskaransingh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to punkt...
[nltk_data]   Package wordnet is already up-to-date!


#### Importing labelled and unlabelled data

In [2]:
cs_data = pd.read_csv("data/labelled/cs_data.csv")
api_unlab = pd.read_csv("data/unlabelled_data/api_unlabelled.csv")

#### Using 80% of the cs_data as unlabelled data only

In [3]:
from sklearn.model_selection import train_test_split as tts

cs_data = cs_data[['title', 'abstract','categories']]

labelled_data, unlabelled_data = tts(cs_data, train_size = 0.2, random_state=42)

In [4]:
unlabelled_data.drop(['categories'], inplace=True, axis=1)

#### Fetching the main columns from unlabelled data fetched using arXiV API

In [5]:
# Unlabelled Dataset from arxiv API

api_unlab.drop(['Unnamed: 0'], axis=1, inplace=True)
api_unlab['title'] = api_unlab['Title']
api_unlab['abstract'] = api_unlab['Abstract']

api_unlab.drop(['Title', 'Abstract'], inplace=True, axis=1)

api_unlab = api_unlab[['title', 'abstract']]

#### Combining the 80% of the cs_data as unlabelled only and the unlabelled data from arXiV API

In [6]:
unlabelled_data = unlabelled_data.append(api_unlab, ignore_index=True)

In [7]:
labelled_data.reset_index(inplace=True, drop=True)
unlabelled_data.reset_index(inplace=True, drop=True)

#### Importing from Semisupervised module created by us

In [8]:
from semi_supervised import Semisupervised

In [9]:
trainer = Semisupervised(labelled_data, unlabelled_data)

#### Logistic Regression as Base Model for Self Training Classifier

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
lg = LogisticRegression(penalty='l2', C=10)

In [12]:
cls_st = trainer.train_self_training_classifier(base_model=lg)

In [13]:
pred, acc = trainer.predict_self_training_classifier(cls_st)

In [14]:
print(acc)

0.8731481481481481


#### KNN as Base Model for Self Training Classifier

In [17]:
from sklearn.neighbors import KNeighborsClassifier

In [18]:
knn = KNeighborsClassifier(n_neighbors=5)

In [19]:
cls_st_knn = trainer.train_self_training_classifier(base_model=knn)

In [20]:
pred_knn, acc_knn = trainer.predict_self_training_classifier(cls_st_knn)

In [21]:
print(acc_knn)

0.6589506172839507


#### RandomForest as Base Model for Self Training Classifier

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
rfc = RandomForestClassifier(n_estimators=250)

In [25]:
cls_st_rfc = trainer.train_self_training_classifier(base_model=rfc)

In [26]:
pred_rfc, acc_rfc = trainer.predict_self_training_classifier(cls_st_rfc)

In [27]:
print(acc_rfc)

0.7981481481481482


#### GaussianNB as Base Model for Self Training Classifier

In [10]:
from sklearn.naive_bayes import GaussianNB

In [11]:
nb = GaussianNB(var_smoothing=0)

In [12]:
cls_st_nb = trainer.train_self_training_classifier(base_model=nb)

In [13]:
pred_nb, acc_nb = trainer.predict_self_training_classifier(cls_st_nb)

In [14]:
print(acc_nb)

0.19907407407407407
