In [1]:
import pickle
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import GridSearchCV
import random
import math
import os

In [2]:
import re
import pickle
import json
import xml.etree.ElementTree as ET
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import os
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
import math
from difflib import SequenceMatcher
import pandas as pd

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
year_regex = re.compile(r'((19[0-9]{2})|(20[0-9]{2}))[a-z]?')
conversion_dict = {}

In [3]:
stop_words = [',', '.', '(', ')', ':', '-', "+", ";", "a", "about", "al", "al.", "all", 
	"already", "also", "although", "am", "an", "and", "another", "any", "anyhow", "are", 
	"aren", "aren't", "around", "as", "at", "back", "be", "because", "been", 
	"being", "beyond", "but", "by", "can", "cannot", "cant", "co", "con", "could", "couldn", 
	"couldnt", "d", "de", "did", "didn", "didn't", "do", "does", "doesn", "doesn't", 
	"doing", "don", "don't", "done", "due", "each", "either", "else", "elsewhere", "et", 
	"etc", "even", "ever", "except", "for", "found", "from", "further", "had", "hadn", 
	"hadn't", "has", "hasn", "hasn't", "hasnt", "have", "haven", "haven't", "having", 
	"he", "hence", "her", "here", "hereafter", "hereby", "hers", 
	"herself", "him", "himself", "his", "how", "however", "i", "ie", "if", "in", "inc", 
	"indeed", "interest", "into", "is", "isn", "isn't", "it", "it's", "its", "itself", 
	"just", "ltd", "ll", "m", "may", "me", "meanwhile", "might", "mightn", 
	"mightn't", "mine", "moreover", "most", "mostly", "move", "much", "must", "mustn", 
	"mustn't", "my", "myself", "name", "namely", "need", "needn", "needn't", "neither", 
	"nevertheless", "no", "nobody", "noone", "nor", "not", "now", "nowhere", "o", "of", 
	"off", "often", "on", "only", "onto", "or", "other", "others", "otherwise", "our", "own", 
	"per", "perhaps", "put", "rather", "re", "s", "same", "see", "seem", "seemed", 
	"seeming", "seems", "serious", "she", "should", "shouldn", "shouldn't", "since", 
	"sincere", "so", "some", "somehow", "someone", "something", "somewhere", "still", 
	"such", "t", "take", "than", "that", "that'll", "the", "their", "theirs", 
	"them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", 
	"therein", "thereupon", "these", "they", "this", "those", "though", "throughout", 
	"thru", "thus", "to", "together", "too", "toward", "towards", "un", "until", "upon", 
	"us", "ve", "very", "via", "was", "wasn", "wasn't", "we", "well", "were", "weren", 
	"weren't", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", 
	"whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", 
	"whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", 
	"without", "won", "won't", "would", "wouldn", "wouldn't", "y", "yet", "you", "your", 
	"yours", "yourself", "yourselves", "from SVM import SVCone", "two", "three", "four", "five", "six", "seven",
	"eight", "nine", "zero", "between", 'below', 'ourselves', "you'll", 'again', 'once', 'over', 'shan', 'few', 
    'against', 'before', 'out', 'down', 'both', 'up', "you've", "shan't", "you're", "should've", 'ours', 'ma', 
    "couldn't", 'during', 'more', 'ain', 'through', 'after', 'above', "she's", "you'd", 'under' ]

In [4]:
def find_common_words(paper_title, cit_title):

	paper_title_words = word_tokenize(paper_title.replace('-', ' ').lower())
	cit_title_words = word_tokenize(cit_title.replace('-', ' ').lower())

	final_paper_title_words = []
	for word in paper_title_words:
		if word in stop_words:
			continue
		elif re.fullmatch(r'[a-z]+', word):
			word = lemmatizer.lemmatize(word)
			final_word = stemmer.stem(word)
			word = final_word

			final_paper_title_words.append(word)

	final_cit_title_words = []
	for word in cit_title_words:
		if word in stop_words:
			continue
		elif re.fullmatch(r'[a-z]+', word):
			word = lemmatizer.lemmatize(word)
			final_word = stemmer.stem(word)
			word = final_word

			final_cit_title_words.append(word)

	return len(set(final_cit_title_words)&set(final_paper_title_words))/(len(set(final_paper_title_words)|set(final_cit_title_words)))

In [5]:
def get_abstract_similarity() :
    abstracts = pickle.load(open("pickles_data/abstracts_total.pkl", "rb"))
    paper_info = pickle.load(open("pickles_data/paper_info_partial_mapping.pickle","rb"))
    ids_consider = pickle.load(open("pickles_data/arnet_exp_keys.pkl", "rb"))
    
    dataset = {}
    for key in paper_info :
        papers = paper_info[key]
        abstract = abstracts[key]
        data = []
        for paper in papers :
            try :
                abcit = "".join(list(paper['abstract']['InvertedIndex'].keys()))
                sim = find_common_words(abcit, abstract)
                dict1 = {}
                dict1['paper_name'] = paper['paper_name']
                dict1['abs_sim'] = sim
                data.append(dict1)
            except :
                pass
        dataset[key] = data
    return dataset

In [6]:
abs_sim = get_abstract_similarity()

In [7]:
location_feature = pickle.load(open("pickles_data/location_feature.pkl","rb"))
title_overlap = pickle.load(open("pickles_data/title_overlap.pkl","rb"))
context_count = pickle.load(open("pickles_data/context_count.pkl","rb"))
cue_words = pickle.load(open("pickles_data/cue_count.pkl", "rb"))
tags = pickle.load(open("pickles_data/baseline_tags.pkl", "rb"))
contexts = pickle.load(open("pickles_data/context_words.pkl", "rb"))
popularity = pickle.load(open("pickles_data/popularity_sent.pkl", "rb"))

In [8]:
year_diff = pickle.load(open("pickles_data/year_diff.pkl","rb"))

In [9]:
fixed_context = pickle.load(open("pickles_data/fixed_context.pkl","rb"))

In [10]:
weighted_cue = pickle.load(open("pickles_data/weighted_cue_words.pkl", "rb"))
bert_embeddings = pickle.load(open("pickles_data/bert_embeddings_relu.pkl", "rb"))

In [11]:
print(len(context_count.keys()))

1181


In [12]:
num_table = pickle.load(open("pickles_data/num_table.pkl","rb"))

In [13]:
cit_titles = pickle.load(open("pickles_data/citation_titles.pkl","rb"))
ids = tags.keys()
ids = list(set(ids).intersection(set(num_table.keys())))

In [14]:
ids_consider = pickle.load(open("pickles_data/arnet_exp_keys.pkl", "rb"))

In [15]:
paper_info = pickle.load(open("pickles_data/paper_info_partial_mapping.pickle", "rb"))

In [16]:
paper_info["P14-1066"]

[{'paper_name': 'joint language and translation modeling with recurrent neural networks.',
  'citation': 129,
  'abstract': {'IndexLength': 133,
   'InvertedIndex': {'We': [0, 51, 92],
    'present': [1],
    'a': [2, 10, 38, 56, 71, 83, 110],
    'joint': [3, 67],
    'language': [4, 47, 77],
    'and': [5, 26, 61, 123],
    'translation': [6, 49],
    'model': [7, 35, 68, 78, 101],
    'based': [8, 18],
    'on': [9, 19, 70, 113, 127],
    'recurrent': [11, 74],
    'neural': [12, 75],
    'network': [13, 76],
    'which': [14],
    'predicts': [15],
    'target': [16, 27],
    'words': [17],
    'an': [20],
    'unbounded': [21],
    'history': [22],
    'of': [23, 33, 85, 109],
    'both': [24],
    'source': [25, 90],
    'words.': [28],
    'The': [29],
    'weaker': [30],
    'independence': [31],
    'assumptions': [32],
    'this': [34, 53],
    'result': [36],
    'in': [37],
    'vastly': [39],
    'larger': [40],
    'search': [41],
    'space': [42],
    'compared': [43, 9

In [17]:
dataset = []
c = 0
cbert = 0
for key in ids_consider :
    papers = cit_titles[key]
    for paper in papers :
        data = {}
        data['paper_id'] = key
        data['citation_title'] = paper
        
        if(key in bert_embeddings):
            embedding_found = 0
            for pap in bert_embeddings[key] :
                if(pap['paper_name']==paper) :
                    cbert+=1
                    data['bert_embed'] = pap['embedding']
                    embedding_found = 1
                    break
                    
            if(embedding_found==1) :
                info_found = 0
                for pap in abs_sim[key] :
                    if(pap['paper_name']==paper) :
                        data['abs_sim'] = pap['abs_sim']
                        info_found = 1
                        break
                if(info_found==1) :
                    for pap in paper_info[key] :
                        if(pap['paper_name']==paper) :
                            data['citation_count'] = pap['citation']
                            try :
                                data['citation_count'] = int(data['citation_count'])
                            except :
                                data['citation_count'] = 0
                    for pap in location_feature[key]:
                        if(pap['paper_name']==paper) :
                            data['location_feature'] = pap['location_feature']
                            break

                    for pap in num_table[key]:
                        if(pap['paper_name']==paper) :
                            data['num_table'] = pap['num_table']
                            break


                    for pap in year_diff[key]:
                        if(pap['paper_name']==paper) :
                            data['year_diff'] = pap['diff']
                            break

                    for pap in title_overlap[key]:
                        if(pap['paper_name']==paper) :
                            data['title_overlap'] = pap['overlap']
                            break

                    for pap in popularity[key]:
                        if(pap['paper_name']==paper) :
                            data['popularity'] = pap['popularity']
                            break

                    for pap in context_count[key]:
                        if(pap['paper_name']==paper) :
                            data['context_count'] = pap['context_count']
                            break

                    for pap in contexts[key] :
                        if(pap['paper_name']==paper) :
                            data['context'] = pap['context']
                            break

                    for pap in cue_words[key]:
                        if(pap['paper_name']==paper) :
                            data['cue_count'] = pap['cue_count']
                            break

                    for pap in weighted_cue[key]:
                        if(pap['paper_name']==paper) :
                            max_data = pap['cue_weights_max']
                            data['wcue_max'] = []
                            for key1 in max_data :
                                data['wcue_max'].append(max_data[key1])
                            add_data = pap['cue_weights_add']
                            data['wcue_add'] = []
                            for key1 in add_data :
                                data['wcue_add'].append(add_data[key1])
                            break

                    for pap in tags[key]:
                        if(pap['paper_name']==paper) :
                            if(pap['tag']==1):
                                data['label'] = 'baseline'
                            else :
                                data['label'] = 'non_baseline'
                            break

                    for pap in fixed_context[key]:
                        if(pap['paper_name']==paper) :
                            data['fixed_context'] = pap['fixed_context']                   
                    dataset.append(data)   
        
print(c)
print(cbert)

0
18067


In [18]:
print(len(dataset))

13166


In [19]:
final_dataset = []
for data in dataset :
    if(data['context_count']!=0):
        final_dataset.append(data)
print(len(final_dataset))

13160


In [20]:
dataset = final_dataset

In [21]:
trainset = dataset[:int(0.8*len(dataset))]
testset = dataset[int(0.8*len(dataset)):]

In [22]:
def sanity_check(probs):

	sum = 0
	for key, value in probs.items():
		sum+=value
	sum = round(sum, 7)
	return sum==1

In [23]:
def lm_model(trainset, testset):

    word_baseline_frequency = {}
    word_non_baseline_frequency = {}
    baseline_length = 0
    non_baseline_length = 0

    for i in trainset:
        # if len(i['fixed_context'])>1 and i['label']=='baseline':
        # 	continue
        for context in i['fixed_context']:
            if i['label']=='baseline':
                baseline_length+=len(context)
            else:
                non_baseline_length+=len(context)
            for word in context:
                if word not in word_baseline_frequency:
                    word_baseline_frequency[word] = 0
                    word_non_baseline_frequency[word] = 0
                if i['label']=='baseline':
                    word_baseline_frequency[word]+=1
                else:
                    word_non_baseline_frequency[word]+=1

    final_word_baseline_frequency = {}
    final_word_baseline_frequency['<unk>'] = 0
    word_non_baseline_frequency['<unk>'] = 0

    for word in word_baseline_frequency:
        if word_baseline_frequency[word]>5:	
            final_word_baseline_frequency[word] = word_baseline_frequency[word]
        else:
            final_word_baseline_frequency['<unk>'] += word_baseline_frequency[word]
            word_non_baseline_frequency['<unk>'] += word_non_baseline_frequency[word]
            del word_non_baseline_frequency[word]

    word_baseline_frequency = final_word_baseline_frequency

    word_baseline_probability = {}
    word_non_baseline_probability = {}

    for word in word_baseline_frequency:
        word_baseline_probability[word] = (word_baseline_frequency[word]+1)/(baseline_length+len(word_baseline_frequency))
        word_non_baseline_probability[word] = (word_non_baseline_frequency[word]+1)/(non_baseline_length+len(word_non_baseline_frequency))

    # print(word_baseline_probability['<unk>'])
    # print(word_non_baseline_probability['<unk>'])

    assert sanity_check(word_baseline_probability) and sanity_check(word_non_baseline_probability)

    conf_mat1 = np.zeros((2,2))
    conf_mat2 = np.zeros((2,2))

    min_p = 1

    for i in trainset+testset:
        P_b = 0
        P_nb = 0
        maxP_b = 0
        minP_nb = 1
        n_better = 0
        for context in i['fixed_context']:
            
            P_b_con = 1
            P_nb_con = 1
            for word in context:
                if word not in word_baseline_probability:
                    P_b_con*=word_baseline_probability['<unk>']
                    P_nb_con*=word_non_baseline_probability['<unk>']
                else:
                    P_b_con*=word_baseline_probability[word]
                    P_nb_con*=word_non_baseline_probability[word]
            if P_b_con>P_nb_con:
                n_better+=1
            P_b+=P_b_con
            P_nb+=P_nb_con
            maxP_b = max(P_b, P_b_con)
            minP_nb = min(P_nb, P_nb_con)
            P_b/=len(i['fixed_context'])
            P_nb/=len(i['fixed_context'])

        # if P_b>P_nb:
        # 	print('yes')

        # i['P_b'] = P_b
        # i['P_nb'] = P_nb
        min_p = min(min_p, P_b, P_nb)
        i['lmp'] = [P_b, P_nb, maxP_b, minP_nb, P_b/P_nb, maxP_b/minP_nb, n_better]

    for i in trainset:	
        # if len(i['fixed_context'])>1 and i['label']=='baseline':
        # 	testset.append(i)
        # 	continue
        i['lmp'][0]/=min_p
        i['lmp'][1]/=min_p
        i['lmp'][2]/=min_p
        i['lmp'][3]/=min_p
        if i['lmp'][0]>i['lmp'][1]:
            if i['label']=='baseline':
                conf_mat1[0, 0]+=1
            else:
                conf_mat1[0, 1]+=1
        else:
            if i['label']=='baseline':
                conf_mat1[1, 0]+=1
            else:
                conf_mat1[1, 1]+=1
        if i['lmp'][2]>i['lmp'][3]:
            if i['label']=='baseline':
                conf_mat2[0, 0]+=1
            else:
                conf_mat2[0, 1]+=1
        else:
            if i['label']=='baseline':
                conf_mat2[1, 0]+=1
            else:
                conf_mat2[1, 1]+=1

    # print(conf_mat1)
    prec = conf_mat1[0,0]/(conf_mat1[0,0]+conf_mat1[0,1])
    rec = conf_mat1[0,0]/(conf_mat1[0,0]+conf_mat1[1,0])
    print(prec, rec, 2*prec*rec/(prec+rec))
    conf_mat1 = np.zeros((2,2))

    # print(conf_mat2)
    prec = conf_mat2[0,0]/(conf_mat2[0,0]+conf_mat2[0,1])
    rec = conf_mat2[0,0]/(conf_mat2[0,0]+conf_mat2[1,0])
    print(prec, rec, 2*prec*rec/(prec+rec))
#     conf_mat2 = np.zeros((2,2))

    for i in testset:
        i['lmp'][0]/=min_p
        i['lmp'][1]/=min_p
        i['lmp'][2]/=min_p
        i['lmp'][3]/=min_p
        if i['lmp'][0]>i['lmp'][1]:
            if i['label']=='baseline':
                conf_mat1[0, 0]+=1
            else:
                conf_mat1[0, 1]+=1
        else:
            if i['label']=='baseline':
                conf_mat1[1, 0]+=1
            else:
                conf_mat1[1, 1]+=1
        if i['lmp'][2]>i['lmp'][3]:
            if i['label']=='baseline':
                conf_mat2[0, 0]+=1
            else:
                conf_mat2[0, 1]+=1
        else:
            if i['label']=='baseline':
                conf_mat2[1, 0]+=1
            else:
                conf_mat2[1, 1]+=1

    # print(conf_mat1)
    prec = conf_mat1[0,0]/(conf_mat1[0,0]+conf_mat1[0,1])
    rec = conf_mat1[0,0]/(conf_mat1[0,0]+conf_mat1[1,0])
    print(prec, rec, 2*prec*rec/(prec+rec))
    conf_mat1 = np.zeros((2,2))

    # print(conf_mat2)
    prec = conf_mat2[0,0]/(conf_mat2[0,0]+conf_mat2[0,1])
    rec = conf_mat2[0,0]/(conf_mat2[0,0]+conf_mat2[1,0])
    print(prec, rec, 2*prec*rec/(prec+rec))
    conf_mat2 = np.zeros((2,2))

In [24]:
lm_model(trainset,testset)

0.3067484662576687 0.7005253940455342 0.4266666666666666
0.26787648970747563 0.8660245183887916 0.4091849400082747
0.2803030303030303 0.6006493506493507 0.38223140495867763
0.2671277285498163 0.8524137931034482 0.4067796610169492


In [25]:
# def context_feature(trainset, testset) :
#     total_counts = {}
#     citation_frequency = {}
#     paper_frequency = {}
#     curr_doc = trainset[0]['paper_id']
#     paper_words = set([])

#     for i in trainset:
#         if i['paper_id']!=curr_doc:
#             for word in paper_words:
#                 if word not in paper_frequency:
#                     paper_frequency[word] = 0
#                 paper_frequency[word]+=1
#             paper_words = set([])
#             curr_doc = i['paper_id']
#         words = set(i['context'])
#         paper_words = paper_words.union(words)
#         for word in words:
#             if word not in total_counts:
#                 total_counts[word] = 0
#                 citation_frequency[word] = 0
#             citation_frequency[word]+=1
#         counts = {word:0 for word in words}
#         for word in i['context']:
#             counts[word]+=1
#             total_counts[word]+=1

#     for word in paper_words:
#         if word not in paper_frequency:
#             paper_frequency[word] = 0
#         paper_frequency[word]+=1

#     final_citation_frequency = {word:count for word, count in citation_frequency.items()}
#     for i in citation_frequency:
#         if paper_frequency[i]<=20:
#             del total_counts[i]
#             del final_citation_frequency[i]

#     citation_frequency = final_citation_frequency
    
#     dataset = trainset+testset
    
#     idfs = {}

#     for word in citation_frequency:
#         idfs[word] = math.log10(len(dataset)*0.8/citation_frequency[word])
    
#     total_counts['<unk>'] = 0
    
#     for i in range(len(dataset)):
#         words = set(dataset[i]['context']) & set(total_counts.keys())
#         unigram_counts = {word:0 for word in words}
#         try:	
#             del unigram_counts['<SOS>']
#             del unigram_counts['<EOS>']
#         except:
#             pass

#         for j in range(len(dataset[i]['context'])):
#             word = dataset[i]['context'][j]
#             if word not in unigram_counts:
#                 continue
#             else:
#                 unigram_counts[word]+=1-dataset[i]['distances'][j]


#         dataset[i]['unigrams'] = unigram_counts


#     total_unigram_counts = total_counts
#     features = list(total_unigram_counts.keys())
#     ngram_to_idx = {features[i]:i for i in range(len(features))}
    
#     pickle.dump(total_counts, open('pickles/total_counts.pkl', 'wb'))
#     pickle.dump(dataset, open('pickles/data_ngram.pkl', 'wb'))
#     pickle.dump(ngram_to_idx, open('pickles/ngram_to_idx.pkl', 'wb'))
#     pickle.dump(idfs, open('pickles/idfs.pkl', 'wb'))
    
#     data_mat = np.zeros((len(dataset), len(ngram_to_idx)+35+1+5+1))

#     for i in range(len(dataset)):
#         context_counts = {}
#         for word in dataset[i]['unigrams']:
#             if word not in context_counts:
#                 context_counts[word] = 0
#             context_counts[word]+=dataset[i]['unigrams'][word]
#             if word in idfs:
#                 data_mat[i, ngram_to_idx[word]] = dataset[i]['unigrams'][word]

#         dataset[i]['context_feature'] = data_mat[i, :len(ngram_to_idx)]
        
#     return dataset

In [26]:
# dataset = context_feature(trainset, testset)

In [27]:
dataset = trainset+testset

In [28]:
values = []
output = []
for data in dataset :
    ar = []
    ar.append(data['context_count'])
    ar.append(data['title_overlap'])
    ar.append(data['citation_count'])
#     ar.append(data['abs_sim'])
#     ar.append(data['cue_count'])
#     ar.append(sum(data['location_feature']))
#     ar.append(data['popularity'])
    ar.append(data['num_table'])
    ar.extend(data['location_feature'])
    ar.extend(data['lmp'])
    ar.extend(data['wcue_max'])
    ar.extend(data['bert_embed'])
    values.append(ar)
    if(data['label']=='baseline'):
        output.append(1)
    else :
        output.append(0)

In [29]:
values = np.array(values)
output = np.array(output)

In [30]:
print(values.shape)

(13160, 317)


In [31]:
# std_scale = MinMaxScaler((0,1))
# values[:,0:-6] = std_scale.fit_transform(values[:,0:-6])
scaler = MinMaxScaler((0,1))
values = scaler.fit_transform(values)

In [32]:
def split(data, output):
    n = len(data)
    last = int(0.8*n)
    train_data = data[:last]
    train_output = output[:last]
    test_data = data[last:]
    test_output = output[last:]
    return train_data, test_data, train_output, test_output

In [33]:
train_data, test_data, train_output, test_output = split(values, output)

In [34]:
def shuffle(train_data, train_output) :
    baselines = []
    non_baselines = []
    for i in range(len(train_output)) :
        if(train_output[i]==1) :
            baselines.append(train_data[i])
        else :
            non_baselines.append(train_data[i])
    
    n = len(baselines)
    ar = np.random.choice(len(non_baselines), len(baselines))
    nb_ar = []
    for x in ar :
        nb_ar.append(non_baselines[x])
        
    data = []
    data.extend(nb_ar)
    output = []
    for i in range(len(data)):
        output.append(0)
    
    data.extend(baselines)
    for i in range(n) :
        output.append(1)
        
    total_data = []
    for i in range(len(data)):
        ar = []
        ar.append(data[i])
        ar.append(output[i])
        total_data.append(ar)
        
    total_data = np.array(total_data)
    np.random.shuffle(total_data)
    
    data = []
    output = []
    for ar in total_data :
        data.append(ar[0])
        output.append(ar[1])
        
    data = np.array(data)
    output = np.array(output)
    
    return data, output

In [35]:
train_data, train_output = shuffle(train_data, train_output)

In [36]:
params = {'C':[0.1, 0.5, 1, 2], 'fit_intercept':[True, False]}
modelin = LogisticRegression(solver='lbfgs', max_iter=1e4, n_jobs=5, random_state=1, warm_start=False, C=1)
model = GridSearchCV(modelin, params, cv=2, n_jobs=5)
#model = LogisticRegression(solver='lbfgs',max_iter=1e4, n_jobs=5, random_state=1, warm_start=False, C=1, fit_intercept=True)
clf = model.fit(train_data, train_output)
predict_test = clf.predict(test_data)
print(classification_report(test_output, predict_test))
predictions = clf.predict(train_data)
print(classification_report(train_output, predictions))
print(model.best_params_)

              precision    recall  f1-score   support

           0       0.97      0.88      0.92      2324
           1       0.47      0.82      0.60       308

    accuracy                           0.87      2632
   macro avg       0.72      0.85      0.76      2632
weighted avg       0.92      0.87      0.89      2632

              precision    recall  f1-score   support

           0       0.91      0.93      0.92      1142
           1       0.93      0.91      0.92      1142

    accuracy                           0.92      2284
   macro avg       0.92      0.92      0.92      2284
weighted avg       0.92      0.92      0.92      2284

{'C': 2, 'fit_intercept': True}


In [None]:
print(sum(test_output))

In [None]:
print(sum(train_output))