In [1]:
import scipy
import sklearn
import json
import pandas as pd
import numpy as np
from collections import Counter
from numpy import random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import Counter

# Download files and set up folder

In [2]:
#download dataset from "https://www.kaggle.com/competitions/sp23-inls690-270-citation-recommendation/data"
training_data_path = './reference_metadata_2013.csv'
test_data_path = './reference_metadata_2020.csv'

In [3]:
# specify data type for each column (to be used in pandas read_csv function)
dtype_dict = {'REFERENCE_ID': str, 'TITLE': str, 'AUTHOR': str, 'YEAR': str, 'ABSTRACT': str, 'CITED': int}

In [4]:
dataframe = pd.read_csv(training_data_path, dtype = dtype_dict, keep_default_na = False)
#dataframe

In [5]:
test_dataframe = pd.read_csv(test_data_path, dtype = dtype_dict, keep_default_na = False)
#test_dataframe

# Feature scaling and hyperparameters tuning

In [6]:
all_train_Y = dataframe['CITED']
all_train_X = dataframe['TITLE']

In [7]:
# feature scaling

from sklearn.preprocessing import MaxAbsScaler

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(all_train_X)

scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [8]:
# hyperparameters tuning

from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
logreg = LogisticRegression()
grid_search = GridSearchCV(logreg, param_grid, cv=5)
grid_search.fit(X_train_scaled, all_train_Y)

print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters:  {'C': 0.1, 'penalty': 'l2'}
Best cross-validation score:  0.8692620243255114


In [9]:
# train the model

model = LogisticRegression(penalty='l2', C = 0.1, solver='liblinear')
model.fit(X_train_scaled, all_train_Y)

LogisticRegression(C=0.1, solver='liblinear')

In [10]:
# evaluation on all train set

train_Y_hat = model.predict_proba(X_train_scaled)
train_Y = dataframe['CITED'].to_numpy()
ap = average_precision_score(train_Y, train_Y_hat[:,1])
print ('Logistic regression, average precision on training set:', ap)

Logistic regression, average precision on training set: 0.5093496852901724


In [11]:
# get the result from test set

test_X = vectorizer.transform(test_dataframe['TITLE'])
X_test_scaled = scaler.transform(test_X)
test_Y_hat = model.predict_proba(X_test_scaled)
print(test_Y_hat)

[[0.96252869 0.03747131]
 [0.91485569 0.08514431]
 [0.8655341  0.1344659 ]
 ...
 [0.88875631 0.11124369]
 [0.87813092 0.12186908]
 [0.90435106 0.09564894]]


In [12]:
# output the result file

def write_test_prediction(df, pred, filepath):
    with open(filepath, 'w') as outfile:
        outfile.write('{},{}\n'.format('REFERENCE_ID', 'Score'))
        for index, row in df.iterrows():
            outfile.write('{},{}\n'.format(row['REFERENCE_ID'], pred[index]))
    print (len(df), 'predictions are written to', filepath)

In [13]:
write_test_prediction(test_dataframe, test_Y_hat[:,1], './logit_reg_1.csv')

171376 predictions are written to ./logit_reg_1.csv


# Feature selection 

In [14]:
# Feature selection

from sklearn.feature_selection import SelectKBest, chi2

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(all_train_X)

selector = SelectKBest(chi2, k=12000)
selector.fit(X_train, all_train_Y)
X_train_new = selector.transform(X_train)

scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train_new)

In [15]:
# train the model and evaluation on all train set

model = LogisticRegression(C = 0.1, solver='liblinear')
model.fit(X_train_scaled, all_train_Y)

train_Y_hat = model.predict_proba(X_train_new)
train_Y = dataframe['CITED'].to_numpy()
ap = average_precision_score(train_Y, train_Y_hat[:,1])
print ('Logistic regression, average precision on training set:', ap)

Logistic regression, average precision on training set: 0.4927160558246138


In [16]:
# get the result from test set

test_X = vectorizer.transform(test_dataframe['TITLE'])
test_X_new = selector.transform(test_X)
test_Y_hat = model.predict_proba(test_X_new)
print(test_Y_hat)

[[0.93699988 0.06300012]
 [0.91462164 0.08537836]
 [0.88238835 0.11761165]
 ...
 [0.89291952 0.10708048]
 [0.89720867 0.10279133]
 [0.89235193 0.10764807]]


In [17]:
# output the result file

write_test_prediction(test_dataframe, test_Y_hat[:,1], './logit_reg_2.csv')

171376 predictions are written to ./logit_reg_2.csv


# Model ensembling 

In [18]:
# equally split into train and validate set

train_ratio = 0.5 # 50% for training, 30% for validation
random_seed = 100

train_dataframe = dataframe.sample(frac=train_ratio, random_state=random_seed)
valid_dataframe = dataframe.drop(train_dataframe.index)
print('training set size:', len(train_dataframe))
print('validation set size:', len(valid_dataframe))

training set size: 7886
validation set size: 7886


In [19]:
# train on model1 (the train set) and get the prediction1

from sklearn.preprocessing import MaxAbsScaler

vectorizer = TfidfVectorizer()
train_X = vectorizer.fit_transform(train_dataframe['TITLE'])

scaler = MaxAbsScaler()
train_X_scaled = scaler.fit_transform(train_X)

train_Y = train_dataframe['CITED']

test_X1 = vectorizer.transform(test_dataframe['TITLE'])
X_test_scaled1 = scaler.transform(test_X1)

print(train_X_scaled.shape)
print(X_test_scaled1.shape)

(7886, 9114)
(171376, 9114)


In [20]:
model = LogisticRegression(penalty='l2', C = 0.1, solver='liblinear')
model.fit(train_X_scaled, train_Y)

test_Y_hat1 = model.predict_proba(X_test_scaled1)
print(test_Y_hat1)

[[0.9508395  0.0491605 ]
 [0.91727432 0.08272568]
 [0.87550741 0.12449259]
 ...
 [0.87330112 0.12669888]
 [0.89491425 0.10508575]
 [0.90589112 0.09410888]]


In [21]:
# train on model2 (the validate set) and get the prediction2

valid_X = vectorizer.fit_transform(valid_dataframe['TITLE'])
valid_X_scaled = scaler.fit_transform(valid_X)
valid_Y = valid_dataframe['CITED']

test_X2 = vectorizer.transform(test_dataframe['TITLE'])
X_test_scaled2 = scaler.transform(test_X2)

print(valid_X_scaled.shape)
print(X_test_scaled2.shape)

(7886, 9072)
(171376, 9072)


In [22]:
model = LogisticRegression(penalty='l2', C = 0.1, solver='liblinear')
model.fit(valid_X_scaled, valid_Y)

test_Y_hat2 = model.predict_proba(X_test_scaled2)
print(test_Y_hat2)

[[0.95191374 0.04808626]
 [0.89756716 0.10243284]
 [0.86021421 0.13978579]
 ...
 [0.8671971  0.1328029 ]
 [0.85554922 0.14445078]
 [0.8795475  0.1204525 ]]


In [23]:
# get the average prediction

predictions = [test_Y_hat1, test_Y_hat2]
avg_predictions = np.mean(predictions, axis=0)
print(avg_predictions)

[[0.95137662 0.04862338]
 [0.90742074 0.09257926]
 [0.86786081 0.13213919]
 ...
 [0.87024911 0.12975089]
 [0.87523173 0.12476827]
 [0.89271931 0.10728069]]


In [24]:
# output the result file

write_test_prediction(test_dataframe, avg_predictions[:,1], './logit_reg_3.csv')

171376 predictions are written to ./logit_reg_3.csv


# Ridge regression

In [26]:
from sklearn.linear_model import RidgeClassifier
from sklearn.calibration import CalibratedClassifierCV

vectorizer = TfidfVectorizer()
vectorizer.fit(dataframe['TITLE'])
all_train_X = vectorizer.transform(dataframe['TITLE'])
all_train_Y = dataframe['CITED']

# use the calibrated model to get the predict_proba 
model = RidgeClassifier()
model.fit(all_train_X, all_train_Y)
calibrated_model = CalibratedClassifierCV(model, cv='prefit')
calibrated_model.fit(all_train_X, all_train_Y)

test_X = vectorizer.transform(test_dataframe['TITLE'])
test_Y_hat = calibrated_model.predict_proba(test_X)
print(test_Y_hat)

[[9.99487108e-01 5.12892249e-04]
 [5.66978286e-01 4.33021714e-01]
 [9.93852204e-01 6.14779595e-03]
 ...
 [9.94385331e-01 5.61466928e-03]
 [7.64634999e-01 2.35365001e-01]
 [9.95750368e-01 4.24963246e-03]]


In [27]:
write_test_prediction(test_dataframe, test_Y_hat[:,1], './ridge_regression.csv')

171376 predictions are written to ./ridge_regression.csv
