In [1]:
import pandas as pd
import numpy as np

from preprocess import preprocess_text

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jaskaransingh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jaskaransingh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to punkt...
[nltk_data]   Package wordnet is already up-to-date!


#### Importing labelled and unlabelled data

In [2]:
labelled_data = pd.read_csv("data/labelled/labelled_data.csv")
unlabelled_data = pd.read_csv("data/unlabelled_data/unlabelled_data_json.csv")

#### For Labelled Data

In [3]:
df = labelled_data[['title', 'abstract','categories']]

In [4]:
df['title'] = preprocess_text(df, 'title')
df['abstract'] = preprocess_text(df, 'abstract')

#### For Unlabelled Data

In [5]:
df_unlab = unlabelled_data[['title', 'abstract']]

In [6]:
df_unlab['title'] = preprocess_text(df_unlab, 'title')
df_unlab['abstract'] = preprocess_text(df_unlab, 'abstract')

In [7]:
unlab_x = df_unlab[['title', 'abstract']].agg(' '.join, axis=1)

#### Creating Training Data

In [8]:
x = df[['title', 'abstract']].agg(' '.join, axis=1)
y = df["categories"]

In [9]:
from sklearn.model_selection import train_test_split as tts

train_x,test_x,train_y,test_y = tts(x,y,test_size=0.2,stratify=y)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(stop_words = 'english')

tfidf_train = tfidf_vect.fit_transform(train_x)
tfidf_test = tfidf_vect.transform(test_x)

tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vect.get_feature_names())

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lg = LogisticRegression(solver='lbfgs', max_iter=1000)

lg.fit(tfidf_train, train_y)
pred = lg.predict(tfidf_test)

print(accuracy_score(test_y,pred))

0.8901234567901235


In [12]:
unlab_df = pd.Series(unlab_x,
              name="text")

unlab_df = unlab_df.to_frame(name="text")

In [14]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
encoded_y = le.fit_transform(train_y)
test_encoded_y = le.fit_transform(test_y)

In [15]:
X_train_mixed = np.concatenate((np.array(train_x), np.array(unlab_x)))

nolabel = [-1 for _ in range(len(unlab_x))]
y_train_mixed = np.concatenate((encoded_y, nolabel))

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_semi_sup = tfidf_vect.transform(X_train_mixed)
semi_df = pd.DataFrame(tfidf_semi_sup.A, columns=tfidf_vect.get_feature_names_out())

# importing the module
from sklearn.semi_supervised import LabelPropagation

# define model
model = LabelPropagation()

# fit model on training dataset
model.fit(semi_df, y_train_mixed)

In [17]:
pred = model.predict(tfidf_test)

In [24]:
print(f'The accuracy using Self Training Classifier is {accuracy_score(test_encoded_y, pred)}')

The accuracy using Self Training Classifier is 0.778395061728395


In [19]:
pd.Series(y_train_mixed).value_counts()

-1    32396
 2     1410
 6     1410
 0     1281
 3     1138
 4      583
 5      451
 1      206
dtype: int64

In [20]:
from sklearn.linear_model import LogisticRegression

tfidf_semi_sup = tfidf_vect.transform(X_train_mixed)
semi_df = pd.DataFrame(tfidf_semi_sup.A, columns=tfidf_vect.get_feature_names_out())

lg = LogisticRegression(solver='lbfgs', max_iter=1000)

# importing selftraining classifier
from sklearn.semi_supervised import SelfTrainingClassifier

# Specify Self-Training model parameters
self_training_model = SelfTrainingClassifier(base_estimator=lg)

# Fit the model
clf_ST = self_training_model.fit(semi_df, y_train_mixed)

In [21]:
pred_self_training = clf_ST.predict(tfidf_test)

In [23]:
print(f'The accuracy using Self Training Classifier is {accuracy_score(test_encoded_y, pred_self_training)}')

The accuracy using Self Training Classifier is 0.8839506172839506
