In [44]:
import pickle
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import random
import math
import os

In [45]:
def _load_pickle(path):
    """Load a single pickled object from `path`, closing the file afterwards."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only point these paths at files produced by this project.
# Each object is indexed by id below and yields a list of per-paper dicts
# carrying a 'paper_name' field (see the dataset-building cell).
section_feat = _load_pickle("../pickles/en_section_feature.pkl")
popular = _load_pickle("../pickles/en_popularity.pkl")
dens = _load_pickle("../pickles/en_density.pkl")
sync = _load_pickle("../pickles/en_synctatic_feat.pkl")
text_feat = _load_pickle("../pickles/en_text_feat.pkl")
tags = _load_pickle("../../pickles_data/baseline_tags.pkl")

In [46]:
# Every id present in the section-feature mapping; drives the dataset loop.
ids = [paper_id for paper_id in section_feat.keys()]

In [47]:
# Number of ids found in section_feat (displayed as the cell output).
len(ids)

1181

In [48]:
def _first_match(records, paper_name):
    """Return the first record whose 'paper_name' equals `paper_name`, else None."""
    return next((rec for rec in records if rec['paper_name'] == paper_name), None)


# Join the per-paper records from every feature source into one dict per paper.
# A source with no record for a paper simply leaves that key out of the dict.
WEIGHT_KEYS = ('weight_comp', 'weight_result', 'weight_subject')

dataset = []
for key in ids:
    for entry in section_feat[key]:
        feats = entry['section_feature']
        row = {'section_feat': [feats[sec] for sec in feats]}

        name = entry['paper_name']

        hit = _first_match(sync[key], name)
        if hit is not None:
            # Truthy/falsy POS flags become 1/0.
            row['sync_feat'] = [1 if flag else 0 for flag in hit['pos_feat']]

        hit = _first_match(popular[key], name)
        if hit is not None:
            row['popularity'] = hit['popularity']

        hit = _first_match(dens[key], name)
        if hit is not None:
            row['density'] = hit['density']

        hit = _first_match(text_feat[key], name)
        if hit is not None:
            # Three "weight is positive" indicators followed by the three raw weights.
            indicators = [1 if hit[w] > 0 else 0 for w in WEIGHT_KEYS]
            row['text_feat'] = indicators + [hit[w] for w in WEIGHT_KEYS]

        hit = _first_match(tags[key], name)
        if hit is not None:
            row['label'] = 'baseline' if hit['tag'] == 1 else 'non_baseline'

        dataset.append(row)

In [49]:
# Total number of per-paper records assembled above (displayed as the cell output).
len(dataset)

31943

In [50]:
# Sanity check on the assembled records. Each complete record has six keys:
# section_feat, sync_feat, popularity, density, text_feat and label.
# The previous version hit an unconditional `break` after the first print, so
# the incomplete-record check below it could never run.
EXPECTED_KEYS = 6

# Show the key count of the first record (same output as before).
if dataset:
    print(len(dataset[0].keys()))

# Print every record that is missing one or more feature groups.
for data in dataset:
    if len(data.keys()) != EXPECTED_KEYS:
        print(data)

6


In [51]:
# Flatten every record into one feature row and a 0/1 target.
# Column order: density, popularity, section features, text features, syntactic features.
values = []
output = []
for record in dataset:
    values.append([
        record['density'],
        record['popularity'],
        *record['section_feat'],
        *record['text_feat'],
        *record['sync_feat'],
    ])
    output.append(1 if record['label'] == 'baseline' else 0)

In [54]:
# NOTE(review): `shuffle` is the helper defined in a later cell (In [60]); it is
# not one of the imports at the top, so on a fresh kernel that cell must be run
# before this one. It keeps all label-1 rows, draws an equal number of label-0
# rows, and returns the combined rows in random order.
values, output = shuffle(values, output)

In [55]:
# Convert features and targets to NumPy arrays for scikit-learn.
values, output = np.array(values), np.array(output)

In [56]:
# Shape of the feature matrix after balancing: (rows, features per row).
print(values.shape)

(5906, 22)


In [57]:
# Scale every feature to [0, 1]. The scaler is fit on the leading 80% of rows
# only, which is exactly the slice that split() later uses as training data.
# Fitting on all rows would leak the test rows' min/max into preprocessing.
# Test rows may therefore fall slightly outside [0, 1]; that is expected.
n_train_rows = int(0.8 * len(values))  # must match the boundary used by split()
fit_rows = values[:n_train_rows] if n_train_rows > 0 else values
scaler = MinMaxScaler((0,1))
scaler.fit(fit_rows)
values = scaler.transform(values)

In [58]:
def split(data, output, train_fraction=0.8):
    """Split features and targets positionally into train and test parts.

    No shuffling is done: the first ``int(train_fraction * len(data))`` items
    become the training part and the remainder the test part.

    Parameters
    ----------
    data : sequence or array
        Feature rows; must support ``len`` and slicing.
    output : sequence or array
        Targets aligned with ``data``; must have the same length.
    train_fraction : float, default 0.8
        Fraction of rows assigned to the training part, within [0, 1].

    Returns
    -------
    tuple
        ``(train_data, test_data, train_output, test_output)``.

    Raises
    ------
    ValueError
        If ``train_fraction`` is outside [0, 1] or the two inputs differ in length.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError("train_fraction must be within [0, 1], got %r" % (train_fraction,))
    n = len(data)
    if len(output) != n:
        # Slicing both by the same index would silently misalign rows and targets.
        raise ValueError("data and output must have the same length (%d != %d)" % (n, len(output)))
    last = int(train_fraction * n)
    return data[:last], data[last:], output[:last], output[last:]

In [59]:
# Positional 80/20 split of the scaled rows (split() does no shuffling itself).
train_data, test_data, train_output, test_output = split(values, output)

In [60]:
def shuffle(train_data, train_output) :
    """Balance the two classes by undersampling, then shuffle the result.

    All rows whose target equals 1 are kept. An equal number of the remaining
    rows is drawn at random, and the combined rows are returned in random order.

    Negatives are drawn *without* replacement whenever enough of them exist,
    so no row is duplicated in the result. Replacement is used only when there
    are fewer negatives than positives.

    Rows and targets are permuted through a shared index array. No ragged
    ``[features, label]`` array is built, since NumPy >= 1.24 rejects those.

    Parameters
    ----------
    train_data : sequence or array of feature rows
    train_output : sequence or array of targets aligned with ``train_data``

    Returns
    -------
    (data, output) : tuple of numpy.ndarray
        Balanced feature rows and their 0/1 targets (1 where the original
        target was 1, 0 otherwise).

    Raises
    ------
    ValueError
        If there are positive rows but no negative rows to sample from.
    """
    features = np.asarray(train_data)
    labels = np.asarray(train_output)

    baseline_idx = np.flatnonzero(labels == 1)
    non_baseline_idx = np.flatnonzero(labels != 1)
    n = len(baseline_idx)

    if n == 0:
        # Nothing to match against: the balanced set is empty.
        sampled_idx = non_baseline_idx[:0]
    elif len(non_baseline_idx) == 0:
        raise ValueError("cannot balance classes: no non-baseline rows to sample from")
    else:
        # Fall back to replacement only when negatives are the minority.
        with_replacement = n > len(non_baseline_idx)
        sampled_idx = np.random.choice(non_baseline_idx, size=n, replace=with_replacement)

    keep = np.concatenate([sampled_idx, baseline_idx])
    np.random.shuffle(keep)  # one permutation keeps rows and targets aligned

    data = features[keep]
    output = (labels[keep] == 1).astype(int)

    return data, output

In [31]:
# train_data, train_output = shuffle(train_data, train_output)

In [61]:
# Grid-search the RBF-SVM regularisation strength C with 2-fold CV on the
# training split, then report on the held-out test split and the training split.
# max_iter must be an integer: recent scikit-learn releases validate
# constructor parameters and reject the float 1e4.
params = {'C':[0.1, 0.5, 1, 2]}
modelin = SVC(kernel='rbf', max_iter=10000, gamma='scale')
model = GridSearchCV(modelin, params, cv=2, n_jobs=5)
# Alternative baseline:
# model = LogisticRegression(solver='lbfgs',max_iter=10000, n_jobs=5, random_state=1, warm_start=False, C=1, fit_intercept=True)
clf = model.fit(train_data, train_output)

# Held-out performance first, then the training-set fit for comparison.
predict_test = clf.predict(test_data)
print(classification_report(test_output, predict_test))
predictions = clf.predict(train_data)
print(classification_report(train_output, predictions))
print(model.best_params_)
# print(type(model.best_estimator_))

              precision    recall  f1-score   support

           0       0.73      0.86      0.79       586
           1       0.84      0.69      0.76       596

    accuracy                           0.78      1182
   macro avg       0.79      0.78      0.78      1182
weighted avg       0.79      0.78      0.78      1182

              precision    recall  f1-score   support

           0       0.76      0.87      0.81      2367
           1       0.85      0.72      0.78      2357

    accuracy                           0.80      4724
   macro avg       0.80      0.80      0.79      4724
weighted avg       0.80      0.80      0.79      4724

{'C': 2}


In [66]:
# Hold out 30% of the training rows as a validation pool (fixed seed).
xtrain, xval, ytrain, yval = train_test_split(
    train_data,
    train_output,
    test_size=0.3,
    random_state=42,
)
# Sizes of the new train part, the validation pool, and the original train split.
for part in (xtrain, xval, train_data):
    print(len(part))

3334
1430
4764


In [67]:
# Quick look at the training part: feature-matrix shape and the target container type.
print(xtrain.shape)
print(type(ytrain))

(3334, 22)
<class 'numpy.ndarray'>


In [68]:
# Self-training with a three-model majority vote.
# Each round fits GaussianNB, SVC and LogisticRegression on the current
# training set, pseudo-labels the validation pool by majority vote, and moves
# the next CHUNK_SIZE pool rows (with their voted labels) into the training
# set. It then reports the ensemble's performance on the test split.
#
# The pool is truncated from the front every round, so the next chunk is
# always the leading CHUNK_SIZE rows. Slicing at 143*i on the already
# truncated pool skipped rows and produced empty chunks in later rounds.
#
# NOTE: this cell mutates xtrain / ytrain / xval / yval; re-run the
# train_test_split cell before re-running it.
N_ROUNDS = 10
CHUNK_SIZE = 143  # 1430 validation rows / 10 rounds

for i in range(N_ROUNDS) :
    model1 = GaussianNB()
    model2 = SVC()
    model3 = LogisticRegression()
    print(i)
    model1.fit(xtrain, ytrain)
    model2.fit(xtrain, ytrain)
    model3.fit(xtrain, ytrain)

    if len(xval) > 0 :
        # Majority vote: label 1 when two or more of the three models predict 1.
        votes = model1.predict(xval) + model2.predict(xval) + model3.predict(xval)
        final_pred = (votes >= 2).astype(int)

        # Promote the leading chunk of the pool with its pseudo-labels.
        xtrain = np.concatenate([xtrain, xval[:CHUNK_SIZE]])
        ytrain = np.concatenate([ytrain, final_pred[:CHUNK_SIZE]])
    print(xtrain.shape)
    print(ytrain.shape)

    # Drop the promoted rows from the pool.
    xval = xval[CHUNK_SIZE:]
    yval = yval[CHUNK_SIZE:]
    print(xval.shape)
    print(yval.shape)

    # Evaluate this round's models (fit before the promotion) on the test split.
    test_votes = model1.predict(test_data) + model2.predict(test_data) + model3.predict(test_data)
    final_predt = (test_votes >= 2).astype(int)

    print(classification_report(test_output, final_predt))

0
(3477, 22)
(3477,)
(1287, 22)
(1287,)
              precision    recall  f1-score   support

           0       0.96      0.86      0.91      5818
           1       0.32      0.68      0.44       571

    accuracy                           0.84      6389
   macro avg       0.64      0.77      0.67      6389
weighted avg       0.91      0.84      0.87      6389

1
(3620, 22)
(3620,)
(1144, 22)
(1144,)
              precision    recall  f1-score   support

           0       0.96      0.86      0.91      5818
           1       0.32      0.68      0.43       571

    accuracy                           0.84      6389
   macro avg       0.64      0.77      0.67      6389
weighted avg       0.91      0.84      0.87      6389

2
(3763, 22)
(3763,)
(1001, 22)
(1001,)
              precision    recall  f1-score   support

           0       0.96      0.86      0.91      5818
           1       0.32      0.68      0.43       571

    accuracy                           0.84      6389
   macro