In [1]:
import pandas as pd
import os
import re
import numpy as np
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import MinMaxScaler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import hamming_loss
from sklearn.multioutput import MultiOutputClassifier

In [2]:
# Location of the feature-augmented papers/authors CSV.
# The original absolute path is kept as the default so existing runs behave the
# same; set the CAPSTONE_DATA_CSV environment variable to point the notebook at
# a copy of the file on another machine without editing this cell.
DATA_PATH = os.environ.get(
    'CAPSTONE_DATA_CSV',
    'E:/OtherCodeProjects/Springboard Capstone Projects/Springboard-Capstone-1-Data/added_features_data.csv',
)

# One row per (paper, author) pair; preview only the first rows.
papers_and_authors = pd.read_csv(DATA_PATH)
papers_and_authors.head()

Unnamed: 0,year,paper_text,paper_id,author_id,title_len,paper_len,avg_word_len,Oral,Poster,Spotlight,Unknown
0,1987,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...,1,1,62,21643,4.808264,0,0,0,1
1,1987,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...,1,2,62,21643,4.808264,0,0,0,1
2,1987,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...,10,14,98,15505,4.886807,0,0,0,1
3,1988,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...,100,155,116,20523,5.784861,0,0,0,1
4,1988,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...,100,54,116,20523,5.784861,0,0,0,1


In [3]:
# Rank authors by how many rows (papers) they appear on, most prolific first.
# Built as a single chain so re-running the cell always starts from
# `papers_and_authors` instead of mutating a previous result in place.
top_authors = (
    papers_and_authors
    .groupby(by='author_id')['paper_id']
    .count()
    .sort_values(ascending=False)
    .rename('paper_count')
    .reset_index()
)
top_authors.head()

Unnamed: 0,author_id,paper_count
0,330,101
1,1472,62
2,178,60
3,121,58
4,1020,51


In [4]:
# Author ids of the five most prolific authors, copied by hand from the ranking
# displayed by the previous cell. Note this rebinds `top_authors` from a
# DataFrame to a plain list of ids.
top_authors = [
    330,
    1472,
    178,
    121,
    1020,
]

In [5]:
# Rows written by any of the selected top authors.
by_top_author = papers_and_authors['author_id'].isin(top_authors)
top_papers = papers_and_authors[by_top_author]

# A paper_id that shows up more than once in `top_papers` is a paper shared by
# at least two of the top authors; keep every occurrence after the first.
repeated_paper = top_papers.duplicated(subset='paper_id', keep='first')
top_papers_duplicates = top_papers[repeated_paper]
top_papers_duplicates.sort_values(by='paper_id')

Unnamed: 0,year,paper_text,paper_id,author_id,title_len,paper_len,avg_word_len,Oral,Poster,Spotlight,Unknown
20270,1993,Supervised learning from incomplete\ndata via ...,767,330,59,20948,5.15191,0,0,0,1
20610,1994,An Alternative Model for Mixtures of\nExperts\...,906,121,44,17802,4.856669,0,0,0,1
20618,1994,Forward dynamic models in human\nmotor control...,909,330,70,19430,5.095964,0,0,0,1
20705,1994,Computational structure of coordinate\ntransfo...,948,330,77,21404,5.326837,0,0,0,1
41,1994,Active Learning with Statistical Models\n\nDav...,1011,330,39,15512,5.052153,0,0,0,1
370,1995,Factorial Hidden Markov Models\nZoubin Ghahram...,1144,330,30,19305,5.185567,0,0,0,1
666,1996,Hidden Markov decision trees\nMichael I. Jorda...,1264,1020,28,18429,5.255123,0,0,0,1
1171,1997,Hierarchical Non-linear Factor Analysis\nand T...,1472,121,60,21648,5.091629,0,0,0,1
1301,1998,SMEM Algorithm for Mixture Models\n\nN aonori ...,1521,121,33,18686,4.903834,0,0,0,1
1788,1999,Learning to Parse Images\n\nGeoffrey E. Hinton...,1710,1020,24,19240,5.012264,0,0,0,1


In [6]:
# One-hot encode the author: every distinct author_id becomes its own
# `author_id_<id>` indicator column, all other columns are carried over.
dummies = pd.get_dummies(
    data=papers_and_authors,
    columns=['author_id'],
)
dummies.head()

Unnamed: 0,year,paper_text,paper_id,title_len,paper_len,avg_word_len,Oral,Poster,Spotlight,Unknown,...,author_id_10473,author_id_10474,author_id_10475,author_id_10476,author_id_10477,author_id_10478,author_id_10479,author_id_10480,author_id_10481,author_id_10482
0,1987,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...,1,62,21643,4.808264,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1987,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...,1,62,21643,4.808264,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1987,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...,10,98,15505,4.886807,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1988,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...,100,116,20523,5.784861,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1988,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...,100,116,20523,5.784861,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Per-column aggregation used when collapsing the (paper, author) rows to one
# row per paper:
#   * one-hot `author_id_<digits>` columns are summed, so every author of a
#     paper ends up flagged on that paper's single row;
#   * every other column takes the value from the first row of the group.
#
# The pattern is a raw string (a bare '\d' in a normal string literal is an
# invalid escape sequence) and is matched against the whole column name with at
# least one digit required, so only genuine one-hot author columns are summed.
_AUTHOR_COLUMN = re.compile(r'author_id_\d+')

keys = dummies.columns
agg_functions = {
    column: 'sum' if _AUTHOR_COLUMN.fullmatch(column) is not None else 'first'
    for column in keys
}

agg_functions

{'year': 'first',
 'paper_text': 'first',
 'paper_id': 'first',
 'title_len': 'first',
 'paper_len': 'first',
 'avg_word_len': 'first',
 'Oral': 'first',
 'Poster': 'first',
 'Spotlight': 'first',
 'Unknown': 'first',
 'author_id_1': 'sum',
 'author_id_2': 'sum',
 'author_id_3': 'sum',
 'author_id_4': 'sum',
 'author_id_5': 'sum',
 'author_id_6': 'sum',
 'author_id_7': 'sum',
 'author_id_8': 'sum',
 'author_id_9': 'sum',
 'author_id_10': 'sum',
 'author_id_11': 'sum',
 'author_id_12': 'sum',
 'author_id_13': 'sum',
 'author_id_14': 'sum',
 'author_id_15': 'sum',
 'author_id_16': 'sum',
 'author_id_17': 'sum',
 'author_id_18': 'sum',
 'author_id_19': 'sum',
 'author_id_20': 'sum',
 'author_id_21': 'sum',
 'author_id_22': 'sum',
 'author_id_23': 'sum',
 'author_id_24': 'sum',
 'author_id_25': 'sum',
 'author_id_26': 'sum',
 'author_id_27': 'sum',
 'author_id_28': 'sum',
 'author_id_29': 'sum',
 'author_id_30': 'sum',
 'author_id_31': 'sum',
 'author_id_32': 'sum',
 'author_id_33': 'sum',

In [8]:
# Collapse the one-row-per-(paper, author) frame into one row per paper using
# the per-column rules in `agg_functions`.
# The grouping key is passed as a Series rather than a column label; presumably
# this keeps `paper_id` available as an aggregated column - confirm before
# changing it to groupby('paper_id').
paper_key = dummies['paper_id']
df = (
    dummies
    .groupby(paper_key)
    .agg(agg_functions)
    .reset_index(drop=True)
)
df

Unnamed: 0,year,paper_text,paper_id,title_len,paper_len,avg_word_len,Oral,Poster,Spotlight,Unknown,...,author_id_10473,author_id_10474,author_id_10475,author_id_10476,author_id_10477,author_id_10478,author_id_10479,author_id_10480,author_id_10481,author_id_10482
0,1987,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...,1,62,21643,4.808264,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1987,184\n\nTHE CAPACITY OF THE KANERVA ASSOCIATIVE...,2,61,16755,4.639499,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1987,52\n\nSupervised Learning of Probability Distr...,3,67,13400,4.984227,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1987,612\n\nConstrained Differential Optimization\n...,4,37,25759,5.013431,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1987,485\n\nTOWARDS AN ORGANIZING PRINCIPLE FOR\nA ...,5,64,32874,5.024413,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7234,2017,"On Separability of Loss Functions, and Revisit...",7280,85,34859,4.725678,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7235,2017,Maxing and Ranking with Few Assumptions\nMoein...,7281,39,36243,4.415865,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7236,2017,On clustering network-valued data\n\nSoumendu ...,7282,33,36121,4.940303,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7237,2017,A General Framework for Robust Interactive\nLe...,7283,51,40083,4.772216,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Authors whose authorship the model will try to predict, and the names of
# their one-hot indicator columns.
tested_authors = [1472, 178]
tested_authors_str = [f'author_id_{author_id}' for author_id in tested_authors]
tested_authors_str

['author_id_1472', 'author_id_178']

In [10]:
# Columns that must not be used as features: the target indicator columns and
# the paper identifier.
exclude = tested_authors_str + ['paper_id']

# Index.difference returns the remaining column labels in sorted order.
feature_columns = df.columns.difference(exclude)
X = df[feature_columns]

# Targets: one indicator column per tested author.
y = df[tested_authors_str]

In [11]:
# Split X into the raw text column and everything else, rescale the
# non-text columns to [0, 1], then stitch the text back on.
# NOTE(review): the scaler is fit on every row before the train/test split is
# made, so the test rows influence the scaling ranges.
numeric_columns = X.columns.difference(['paper_text'])
paper_text = pd.DataFrame(X.paper_text)

minmax = MinMaxScaler()
scaled_feat = pd.DataFrame(
    minmax.fit_transform(X[numeric_columns]),
    columns=numeric_columns,
)

# The join aligns on the row index of both frames.
X = scaled_feat.join(paper_text)

In [23]:
# Module-level slots filled in by vector_train_test(); kept so later cells
# that read these names keep working.
X_train = None
X_test = None
y_train = None
y_test = None

def vector_train_test(X, y, rand_state, train_size=0.75):
    """Split X/y into train and test sets and vectorize the paper text.

    The `paper_text` column is turned into TF-IDF features (vocabulary learned
    on the training rows only) and placed after the remaining columns of X.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing a `paper_text` column; every other column is
        passed through unchanged and must be numeric.
    y : pandas.DataFrame
        Target columns; also used to stratify the split.
    rand_state : int
        Seed forwarded to `train_test_split` for a reproducible split.
    train_size : float, default 0.75
        Fraction of rows placed in the training set.

    Returns
    -------
    tuple of numpy.ndarray
        `(X_train, X_test, y_train, y_test)` as dense arrays. The same four
        arrays are also stored in the module-level variables of the same
        names, which is how this function originally exposed its results.
    """
    global X_train, X_test, y_train, y_test

    # Work on local names so the globals never hold half-processed frames.
    X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
        X, y, random_state=rand_state, train_size=train_size, stratify=y
    )

    other_columns = X.columns.difference(['paper_text'])
    other_features_train = csr_matrix(X_train_df[other_columns].values)
    other_features_test = csr_matrix(X_test_df[other_columns].values)

    # Fit the vocabulary on the training text only, then reuse it for the test
    # text so both matrices share the same columns.
    tfidf = TfidfVectorizer()
    text_train = tfidf.fit_transform(X_train_df['paper_text'])
    text_test = tfidf.transform(X_test_df['paper_text'])

    # Non-text features first, TF-IDF features after them. The dense conversion
    # is kept because the results have always been exposed as ndarrays; it can
    # be memory-hungry for a large vocabulary.
    X_train = hstack([other_features_train, text_train], format='csr').toarray()
    X_test = hstack([other_features_test, text_test], format='csr').toarray()
    y_train = y_train_df.to_numpy()
    y_test = y_test_df.to_numpy()

    return X_train, X_test, y_train, y_test


In [24]:
# Build the train/test arrays. The function stores its results in the
# module-level X_train, X_test, y_train and y_test variables; rand_state=11
# is the seed passed to the split.
vector_train_test(X=X, y=y, rand_state=11)

In [25]:
# Inspect the test feature matrix set by vector_train_test().
X_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [26]:
# Inspect the test labels: one column per entry in `tested_authors_str`.
y_test

array([[0, 0],
       [0, 0],
       [0, 0],
       ...,
       [0, 1],
       [0, 0],
       [0, 0]], dtype=uint8)

In [27]:
#pipe = Pipeline(steps=[
            #('ros', RandomOverSampler(random_state=21)),
            #('LR', LogisticRegression(C=1, max_iter=1000))
            #])

In [28]:
#pipe.fit(X_train, y_train)
#pipe_train_pred = pipe.predict(X_train)
#pipe_test_pred = pipe.predict(X_test)

In [29]:
# Base estimator used for each target column.
LR = LogisticRegression(C=1, max_iter=1000)

# Wrap the base estimator so it can be trained against the multi-column label
# matrix, and fit it on the training split (fit returns the fitted wrapper).
multi_output_LR = MultiOutputClassifier(estimator=LR).fit(X_train, y_train)

# Predicted labels for the held-out rows.
prediction = multi_output_LR.predict(X_test)

In [30]:
# Inspect the predicted labels for the test rows.
prediction

array([[0, 0],
       [0, 0],
       [0, 0],
       ...,
       [0, 0],
       [0, 0],
       [0, 0]], dtype=uint8)

In [31]:
# Fraction of individual label entries that were predicted incorrectly.
# NOTE(review): the displayed labels look heavily skewed towards 0, so a low
# Hamming loss may be reachable by predicting all zeros - consider also
# reporting a per-label recall or F1 score.
h_loss = hamming_loss(y_true=y_test, y_pred=prediction)
h_loss

0.008011049723756906