In [1]:
import pandas as pd
import numpy as np

from preprocess import preprocess_text

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jaskaransingh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jaskaransingh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to punkt...
[nltk_data]   Package wordnet is already up-to-date!


#### Importing labelled and unlabelled data

In [2]:
# Input locations, kept in one place so they are easy to spot and change.
LABELLED_CSV = "data/labelled/labelled_data.csv"
UNLABELLED_CSV = "data/unlabelled_data/unlabelled_data_json.csv"

# Raw frames; later cells select the text columns (and, for the labelled
# set, the `categories` target) from these.
labelled_data = pd.read_csv(LABELLED_CSV)
unlabelled_data = pd.read_csv(UNLABELLED_CSV)

#### For Labelled Data

In [3]:
# Keep only the two text fields and the target column.
# `.copy()` makes `df` an independent frame, so the column assignments in the
# preprocessing cell write to `df` itself rather than to a slice of
# `labelled_data` (which raises pandas' SettingWithCopyWarning and may not
# update the data reliably).
df = labelled_data[['title', 'abstract', 'categories']].copy()

In [4]:
# Run the project's text preprocessing over each free-text column in turn,
# replacing the raw column with its cleaned version.
for text_col in ('title', 'abstract'):
    df[text_col] = preprocess_text(df, text_col)

#### For Unlabelled Data

In [5]:
# Keep only the two text fields of the unlabelled set.
# `.copy()` makes `df_unlab` an independent frame, so the column assignments
# in the preprocessing cell write to `df_unlab` itself rather than to a slice
# of `unlabelled_data` (which raises pandas' SettingWithCopyWarning).
df_unlab = unlabelled_data[['title', 'abstract']].copy()

In [6]:
# Same cleaning as for the labelled set: each text column is replaced by the
# output of the project's preprocessing helper.
for text_col in ('title', 'abstract'):
    df_unlab[text_col] = preprocess_text(df_unlab, text_col)

In [7]:
# Build one text document per unlabelled row by joining its title and
# abstract with a single space.
unlab_x = df_unlab[['title', 'abstract']].agg(' '.join, axis=1)

#### Creating Training Data

In [8]:
# Features: one text document per labelled row (title and abstract joined by
# a single space). Target: the row's `categories` value.
x = df[['title', 'abstract']].agg(' '.join, axis=1)
y = df["categories"]

In [9]:
from sklearn.model_selection import train_test_split as tts

# Stratified 80/20 split so every category keeps its share in both parts.
# A fixed `random_state` makes the split -- and every accuracy figure derived
# from it -- reproducible across kernel restarts (same seed as the SMOTE step).
train_x, test_x, train_y, test_y = tts(
    x, y, test_size=0.2, stratify=y, random_state=42
)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words are dropped and the vocabulary is capped at 1000 terms.
tfidf_vect = TfidfVectorizer(stop_words='english', max_features=1000)

# Fit on the training split only; the test split is projected onto the
# training vocabulary so no test information leaks into the features.
tfidf_train = tfidf_vect.fit_transform(train_x)
tfidf_test = tfidf_vect.transform(test_x)

# Dense, column-labelled view of the training matrix for inspection.
# `get_feature_names_out()` is the supported accessor (the older
# `get_feature_names()` has been removed from scikit-learn) and matches the
# calls used further down; `.toarray()` replaces the deprecated `.A` shorthand.
tfidf_df = pd.DataFrame(
    tfidf_train.toarray(), columns=tfidf_vect.get_feature_names_out()
)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Supervised baseline: logistic regression on the TF-IDF training features.
# `fit` returns the estimator itself, so construction and fitting are chained.
lg = LogisticRegression(solver='lbfgs', max_iter=1000).fit(tfidf_train, train_y)

# Score the baseline on the held-out split.
pred = lg.predict(tfidf_test)
baseline_accuracy = accuracy_score(test_y, pred)
print(baseline_accuracy)

0.8666666666666667


In [12]:
# Single-column frame ("text") holding the joined title+abstract of every
# unlabelled row; `unlab_x` is already a Series, so it converts directly.
unlab_df = unlab_x.to_frame(name="text")

In [13]:
from imblearn.over_sampling import BorderlineSMOTE

# Oversample the training split with a fixed seed.
# NOTE: the result is written back to `tfidf_train` / `train_y`, so from this
# cell on those names refer to the resampled data, and re-running the cell
# resamples the already-resampled data.
sm2 = BorderlineSMOTE(random_state=42)
resampled_features, resampled_labels = sm2.fit_resample(tfidf_train, train_y)
tfidf_train, train_y = resampled_features, resampled_labels

In [14]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Learn the category -> integer mapping from the training labels.
encoded_y = le.fit_transform(train_y)

# Encode the test labels with that SAME mapping. Calling `fit_transform`
# here would rebuild the mapping from `test_y` alone, so a category missing
# from the test split would silently shift every integer code and make the
# accuracy comparison meaningless. `transform` raises on a label never seen
# in training instead of mis-encoding it.
test_encoded_y = le.transform(test_y)

In [16]:
# Display the (rows, columns) shape of the training feature matrix.
tfidf_train.shape

(9870, 1000)

In [17]:
# Project the unlabelled documents onto the vocabulary learned from the
# training split (transform only -- the vectorizer is not refitted).
tfidf_unlab = tfidf_vect.transform(unlab_x)

# Dense, column-labelled copies of both matrices so they can be stacked into
# one frame. `.toarray()` replaces the deprecated `.A` shorthand.
feature_names = tfidf_vect.get_feature_names_out()
tfidf_train_df = pd.DataFrame(tfidf_train.toarray(), columns=feature_names)
tfidf_unlab_df = pd.DataFrame(tfidf_unlab.toarray(), columns=feature_names)

print(tfidf_train_df.shape, tfidf_unlab_df.shape)

(9870, 1000) (32396, 1000)


In [18]:
# Display the (rows, columns) shape of the unlabelled feature matrix.
tfidf_unlab.shape

(32396, 1000)

In [19]:
# Stack the labelled training rows on top of the unlabelled rows.
# `pd.concat` is used because `DataFrame.append` was deprecated in pandas 1.4
# and removed in pandas 2.0.
X_train_mixed = pd.concat([tfidf_train_df, tfidf_unlab_df], ignore_index=True)

# scikit-learn's semi-supervised estimators treat -1 as "no label": one -1
# per unlabelled row, appended after the encoded training labels so the
# order matches the row order of `X_train_mixed`.
nolabel = [-1] * len(unlab_x)
y_train_mixed = np.concatenate((encoded_y, nolabel))

In [20]:
# Display the shape of the combined labelled + unlabelled feature frame.
X_train_mixed.shape

(42266, 1000)

In [21]:
# Sanity check: the label vector should have one entry per row of X_train_mixed.
len(y_train_mixed)

42266

In [None]:
# Import scikit-learn's LabelPropagation semi-supervised estimator
from sklearn.semi_supervised import LabelPropagation

# Instantiate it with its default hyper-parameters
model = LabelPropagation()

# Fit on the combined feature frame; entries of y_train_mixed equal to -1 mark
# the unlabelled rows.
# NOTE(review): X_train_mixed has 42266 rows -- depending on the kernel in use
# this fit may need a very large amount of memory; confirm it completes.
model.fit(X_train_mixed, y_train_mixed)

In [None]:
# Predict labels for the held-out TF-IDF test matrix with the fitted
# LabelPropagation model. NOTE: this reuses the name `pred`, overwriting the
# baseline logistic-regression predictions stored under it earlier.
pred = model.predict(tfidf_test)

In [None]:
# `pred` holds the LabelPropagation predictions at this point, so the message
# names that model (it previously said "Self Training Classifier", which is
# the model evaluated further down).
print(f'The accuracy using Label Propagation is {accuracy_score(test_encoded_y, pred)}')

In [None]:
# Count how many rows carry each encoded label; -1 is the placeholder that
# was assigned to every unlabelled row.
pd.Series(y_train_mixed).value_counts()

In [None]:
from sklearn.linear_model import LogisticRegression

# `X_train_mixed` already contains TF-IDF features (labelled rows followed by
# unlabelled rows), so it is used as-is. Passing it through
# `tfidf_vect.transform` again would iterate over the DataFrame's column
# names instead of documents, yielding one row per feature name -- a matrix
# that no longer lines up with `y_train_mixed` and makes `fit` fail.
semi_df = X_train_mixed

lg = LogisticRegression(solver='lbfgs', max_iter=1000)

# importing selftraining classifier
from sklearn.semi_supervised import SelfTrainingClassifier

# Wrap the logistic regression in a self-training meta-estimator; rows whose
# label is -1 in `y_train_mixed` are the ones treated as unlabelled.
self_training_model = SelfTrainingClassifier(base_estimator=lg)

# Fit on the mixed labelled/unlabelled feature frame
clf_ST = self_training_model.fit(semi_df, y_train_mixed)

In [None]:
# Predict labels for the held-out TF-IDF test matrix with the fitted
# self-training classifier.
pred_self_training = clf_ST.predict(tfidf_test)

In [None]:
# Accuracy of the self-training predictions against the label-encoded test targets.
print(f'The accuracy using Self Training Classifier is {accuracy_score(test_encoded_y, pred_self_training)}')

In [3]:
from semi_supervised import Semisupervised

In [4]:
# Hand both raw frames to the project-local `Semisupervised` helper (imported
# from `semi_supervised`); its internals are not shown in this notebook.
# NOTE(review): the execution counts restart here -- this cell still needs
# `labelled_data` / `unlabelled_data` from the loading cell on a fresh kernel.
trainer = Semisupervised(labelled_data, unlabelled_data)

In [5]:
# Train via the helper; presumably returns the fitted self-training
# classifier -- verify against `semi_supervised`.
cls_st = trainer.train_self_training_classifier()

In [6]:
# The helper returns a pair, unpacked here as predictions and an accuracy
# score (meaning inferred from the names -- verify against `semi_supervised`).
pred, acc = trainer.predict_self_training_classifier(cls_st)

In [7]:
# Show the score returned by the helper's self-training evaluation.
print(acc)

0.8820987654320988
