In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import os
import math
import string
import time

from sklearn import feature_extraction
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn import svm

In [2]:
# Column names for the TSV files, in file order (the files are read with
# header=None, so these names are applied positionally).
columns = [
    'Assessment', 'Docid', 'Title', 'Authors', 'Journal',
    'ISSN', 'Year', 'Language', 'Abstract', 'Keywords',
]

# All three splits are parsed with the same options.
tsv_options = dict(sep='\t', header=None, names=columns)

train_df = pd.read_csv('phase1.train.shuf.tsv', **tsv_options)
dev_df = pd.read_csv('phase1.dev.shuf.tsv', **tsv_options)
test_df = pd.read_csv('phase1.test.shuf.tsv', **tsv_options)

In [3]:
# Peek at the first two training rows to sanity-check the column mapping.
train_df.head(2)

Unnamed: 0,Assessment,Docid,Title,Authors,Journal,ISSN,Year,Language,Abstract,Keywords
0,-1,hash:3f1ebe70-a242-3b43-843c-eef89284607a,Misoprostol for treating postpartum haemorrhag...,"Hofmeyr, G. J.;Ferreira, S.;Nikodem, V. C.;Man...",BMC Pregnancy and Childbirth,1471-2393,2004,eng,Background: Postpartum haemorrhage remains an ...,South Africa;adult;article;blood transfusion;c...
1,-1,hash:aa35378f-0460-37f1-b001-ac735e027333,Vitamin A supplements and diarrheal and respir...,"Fawzi, W. W.;Mbise, R.;Spiegelman, D.;Fataki, ...",J Pediatr,0022-3476,2000,eng,OBJECTIVE: To determine the effect of vitamin ...,"Child, Preschool;Diarrhea/ epidemiology;Dietar..."


In [4]:
# Non-null count of every column per Assessment label: shows the size of each
# class and which columns contain missing values.
train_df.groupby(['Assessment']).count()

Unnamed: 0_level_0,Docid,Title,Authors,Journal,ISSN,Year,Language,Abstract,Keywords
Assessment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-1,20967,20967,20561,20148,12862,20287,20966,20966,20259
1,695,695,689,689,560,694,695,695,653


In [5]:
# Total number of training rows.
len(train_df)

21662

In [6]:
# Class balance of the training set, as a percentage of all rows.
# Label 1 is the positive class and -1 the negative class, matching how
# precision_recall_f1score counts true positives (pred == 1 and truth == 1).
# The original labels were swapped and announced a count ("# of") while
# printing a percentage.
class_counts = train_df.groupby(['Assessment']).count()['Docid']
print('% of Positive (1): ' + str(class_counts.loc[1] / len(train_df) * 100))
print('% of Negative (-1): ' + str(class_counts.loc[-1] / len(train_df) * 100))

# of Positive (-1): 3.208383344104884
# of Negative (1): 96.79161665589511


In [7]:
# Distinct values of the Language column in the training set.
train_df['Language'].unique()

array(['eng', 'ger', 'fre', 'por', nan, 'chi', 'spa', 'afr', 'dut'],
      dtype=object)

In [8]:
# Report (rows, columns) for each split.
for label, frame in (
    ('Size of train dataset', train_df),
    ('Size of dev dataset', dev_df),
    ('Size of test dataset', test_df),
):
    print(label, frame.shape)

Size of train dataset (21662, 10)
Size of dev dataset (4850, 10)
Size of test dataset (4814, 10)


In [9]:
# Normalization
#     remove apostrophes and every other punctuation character
#         https://www.geeksforgeeks.org/python-remove-punctuation-from-string/
#     Not done here: lower-casing (preprocess does it), expanding
#     contractions, and removing numbers (digits are kept).

def normalization(word):
    """Return `word` with all punctuation removed.

    Apostrophes are deleted first, then every character that is neither a
    word character nor whitespace, and finally anything in
    ``string.punctuation``. The last pass is what removes underscores,
    which the regex keeps because ``_`` is a word character. Case, digits
    and whitespace are left unchanged.
    """
    no_quotes = word.replace("'", '')
    no_symbols = re.sub(r'[^\w\s]', '', no_quotes)
    drop_ascii_punct = str.maketrans('', '', string.punctuation)
    return no_symbols.translate(drop_ascii_punct)

In [10]:
# Preprocess dataset
#   Normalization (punctuation removal) + lower-casing
#   Removed Stop words
def preprocess(data):
    """Clean every text in `data` and return the results as a list of strings.

    Each item is passed through `normalization`, lower-cased, split on
    whitespace, stripped of English stop words and re-joined with single
    spaces.

    Parameters
    ----------
    data : iterable of str
        Raw texts, e.g. a pandas Series of titles.

    Returns
    -------
    list of str
        One cleaned string per input item, in input order.
    """
    # Load the stop-word list once instead of once per line, and keep it in a
    # set so each membership test is O(1) rather than a list scan.
    stopwords = set(nltk.corpus.stopwords.words("english"))

    # NOTE(review): apostrophes are removed before filtering, so a contraction
    # such as "don't" becomes "dont" and presumably no longer matches the
    # stop-word list -- confirm whether that is intended.
    result = []
    for line in data:
        tokens = normalization(line).lower().strip().split()
        result.append(" ".join(w for w in tokens if w not in stopwords))
    return result

In [11]:
def precision_recall_f1score(pred, truth):
    """Print and return recall, precision, F1 and accuracy for +1/-1 labels.

    Label 1 is the positive class and -1 the negative class. Pairs in which
    either value is something other than 1 or -1 are ignored.

    A metric whose denominator is zero (no predicted positives, no actual
    positives, or no scored pairs at all) is reported as 0.0 instead of
    raising ZeroDivisionError.

    Parameters
    ----------
    pred : iterable
        Predicted labels.
    truth : iterable
        Gold labels, aligned with `pred`.

    Returns
    -------
    tuple
        ``(recall, precision, f1_score, accuracy)`` as fractions in [0, 1].
        Note the order: recall comes first despite the function name.
    """
    TP, FP, FN, TN = 0, 0, 0, 0
    for p, t in zip(pred, truth):
        if p == 1:
            if t == 1:
                TP += 1
            elif t == -1:
                FP += 1
        elif p == -1:
            if t == 1:
                FN += 1
            elif t == -1:
                TN += 1

    # Guard every division: a classifier that never predicts the rare
    # positive class would otherwise crash the evaluation.
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    if precision + recall:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0.0
    corr, total = TN + TP, TN + TP + FN + FP
    accuracy = corr / total if total else 0.0

    print("Recall: {}/{} = {} %".format(TP, TP+FN, recall*100))
    print("Precision: {}/{} = {} %".format(TP, TP+FP, precision*100) )
    print("F1 score: {}".format(f1_score))
    print("Accuracy: {}/{} = {} %".format(corr,total,accuracy*100) )
    return recall, precision, f1_score, accuracy

In [12]:
# Clean the Title column of every split with the same preprocessing.
# Despite the `_df` suffix, these are plain lists of strings (preprocess
# returns a list).
train_clean_df, dev_clean_df, test_clean_df = (
    preprocess(split_df['Title']) for split_df in (train_df, dev_df, test_df)
)

In [13]:
# Fit the TF-IDF vocabulary on the training titles only ...
vectorize = TfidfVectorizer()
train_X = vectorize.fit_transform(train_clean_df)

# ... and reuse that fitted vectorizer for the dev titles.
dev_X= vectorize.transform(dev_clean_df)

# Training labels (values 1 / -1 in the Assessment column).
train_Y = train_df['Assessment']

In [14]:
# Banner marking the start of the title-only BernoulliNB run in the output.
print('BernoulliNB')

BernoulliNB


In [15]:
# Baseline: Bernoulli Naive Bayes on the title TF-IDF features.
# NOTE(review): BernoulliNB binarizes its input by default, so the TF-IDF
# weights presumably act only as term presence/absence -- confirm.
clf_nb = BernoulliNB(alpha=0.1)
clf_nb.fit(train_X, train_Y)

In [16]:
# Metrics on the same titles the model was fitted on (resubstitution).
print("Train BernoulliNB Title")
preds_nb = clf_nb.predict(train_X)
precision_recall_f1score(preds_nb, train_df['Assessment'])

Train BernoulliNB Title
Recall: 527/695 = 75.8273381294964 %
Precision: 527/862 = 61.1368909512761 %
F1 score: 0.6769428387925497
Accuracy: 21159/21662 = 97.67796140707229 %


(0.758273381294964, 0.611368909512761, 0.6769428387925497, 0.9767796140707229)

In [17]:
# Metrics on the dev titles, which were not used for fitting.
print("dev_X BernoulliNB Title")
preds_nb_dev = clf_nb.predict(dev_X)
precision_recall_f1score(preds_nb_dev, dev_df['Assessment'])

dev_X BernoulliNB Title
Recall: 54/150 = 36.0 %
Precision: 54/110 = 49.09090909090909 %
F1 score: 0.4153846153846154
Accuracy: 4698/4850 = 96.8659793814433 %


(0.36, 0.4909090909090909, 0.4153846153846154, 0.968659793814433)

## Experiment #1: Is More Text Better?

In [18]:
# Show one row as a reminder of which text columns are available.
train_df.head(1)

Unnamed: 0,Assessment,Docid,Title,Authors,Journal,ISSN,Year,Language,Abstract,Keywords
0,-1,hash:3f1ebe70-a242-3b43-843c-eef89284607a,Misoprostol for treating postpartum haemorrhag...,"Hofmeyr, G. J.;Ferreira, S.;Nikodem, V. C.;Man...",BMC Pregnancy and Childbirth,1471-2393,2004,eng,Background: Postpartum haemorrhage remains an ...,South Africa;adult;article;blood transfusion;c...


In [19]:
# Replace missing Abstract / Keywords values with empty strings in every
# split so the columns can be concatenated as text. Title is not filled here.
for frame in (train_df, dev_df, test_df):
    for text_col in ('Abstract', 'Keywords'):
        frame[text_col] = frame[text_col].fillna('')

In [20]:
def _with_more_text(frame):
    """Return a copy of `frame` with a More_Text column: Title, Abstract and
    Keywords joined by single spaces."""
    combined = frame['Title'] + " " + frame['Abstract'] + " " + frame['Keywords']
    return frame.assign(More_Text=combined)


train_long_df = _with_more_text(train_df)
dev_long_df = _with_more_text(dev_df)
test_long_df = _with_more_text(test_df)

In [21]:
# Banner naming the columns combined for this experiment.
print("Title, Abstract, Keywords")

Title, Abstract, Keywords


In [22]:
# Clean the combined More_Text column of every split.
# Despite the `_df` suffix, these are plain lists of strings (preprocess
# returns a list).
train_clean_long_df, dev_clean_long_df, test_clean_long_df = (
    preprocess(split_df['More_Text'])
    for split_df in (train_long_df, dev_long_df, test_long_df)
)

In [23]:
# Re-binds `vectorize`: from this cell on it is the vectorizer fitted on the
# combined training text, replacing the title-only one. The later cell that
# builds test_long_X relies on this binding.
vectorize = TfidfVectorizer()
train_long_X = vectorize.fit_transform(train_clean_long_df)

# Dev text is transformed with the vocabulary fitted on the training text.
dev_long_X= vectorize.transform(dev_clean_long_df)

# Training labels for the long-text experiments.
train_long_Y = train_long_df['Assessment']

In [24]:
# Banner marking the start of the long-text BernoulliNB run in the output.
print('BernoulliNB')

BernoulliNB


In [25]:
# Same BernoulliNB settings as the title-only baseline, now fitted on the
# combined Title + Abstract + Keywords features.
clf_nb_long = BernoulliNB(alpha=0.1)
clf_nb_long.fit(train_long_X, train_long_Y)

In [26]:
# Metrics on the same long-text rows the model was fitted on (resubstitution).
print("Train_long_X BernoulliNB")
preds_nb_long = clf_nb_long.predict(train_long_X)
precision_recall_f1score(preds_nb_long, train_long_df['Assessment'])

Train_long_X BernoulliNB
Recall: 645/695 = 92.80575539568345 %
Precision: 645/974 = 66.22176591375771 %
F1 score: 0.7729179149191133
Accuracy: 21283/21662 = 98.25039239220756 %


(0.9280575539568345, 0.662217659137577, 0.7729179149191133, 0.9825039239220755)

In [27]:
# Metrics on the dev long-text rows, which were not used for fitting.
print("Dev_long_X BernoulliNB")
preds_nb_dev_long = clf_nb_long.predict(dev_long_X)
precision_recall_f1score(preds_nb_dev_long, dev_long_df['Assessment'])

Dev_long_X BernoulliNB
Recall: 48/150 = 32.0 %
Precision: 48/86 = 55.81395348837209 %
F1 score: 0.4067796610169492
Accuracy: 4710/4850 = 97.11340206185567 %


(0.32, 0.5581395348837209, 0.4067796610169492, 0.9711340206185567)

## Experiment #2: Surprise Me
Conduct another experiment, preferably something non-trivial. 
1. Using Another Machine Learning Model

In [28]:
# Banner marking the start of Experiment #2 in the output.
print("Experiment #2")

Experiment #2


### KNN

In [29]:
# Banner marking the start of the KNN run in the output.
print("KNN")

KNN


In [30]:
# 5-nearest-neighbour classifier with distance-weighted votes, fitted on the
# long-text TF-IDF features.
clf_knn=KNeighborsClassifier(n_neighbors=5, weights='distance')
clf_knn.fit(train_long_X, train_long_Y)

In [31]:
# Metrics on the rows the model was fitted on (resubstitution).
# NOTE(review): with weights='distance' each training point is presumably
# its own zero-distance neighbour, so these scores are expected to be
# near-perfect and say little about generalization -- confirm.
print("Train_long_X KNN")
preds_knn = clf_knn.predict(train_long_X)
precision_recall_f1score(preds_knn, train_long_df['Assessment'])

Train_long_X KNN
Recall: 694/695 = 99.85611510791367 %
Precision: 694/694 = 100.0 %
F1 score: 0.9992800575953923
Accuracy: 21661/21662 = 99.99538362108763 %


(0.9985611510791367, 1.0, 0.9992800575953923, 0.9999538362108762)

In [32]:
# Metrics on the dev long-text rows, which were not used for fitting.
print("Dev_long_X KNN")
preds_knn_dev_long =clf_knn.predict(dev_long_X)
precision_recall_f1score(preds_knn_dev_long, dev_long_df['Assessment'])

Dev_long_X KNN
Recall: 14/150 = 9.333333333333334 %
Precision: 14/36 = 38.88888888888889 %
F1 score: 0.15053763440860216
Accuracy: 4692/4850 = 96.74226804123711 %


(0.09333333333333334,
 0.3888888888888889,
 0.15053763440860216,
 0.9674226804123711)

### PassiveAggressiveClassifier

In [33]:
# Banner marking the start of the PassiveAggressiveClassifier run in the output.
print("PassiveAggressiveClassifier")

PassiveAggressiveClassifier


In [34]:
# Passive-Aggressive linear classifier on the long-text TF-IDF features.
# The estimator shuffles the training data between epochs, so a fixed
# random_state is needed for the metrics below to be reproducible under
# Restart Kernel -> Run All. Scores may differ slightly from outputs that
# were produced by an unseeded run.
clf_pass = PassiveAggressiveClassifier(max_iter=100, random_state=42)
clf_pass.fit(train_long_X, train_long_Y)

In [35]:
# Metrics on the rows the model was fitted on (resubstitution).
print("Train_long_X PassiveAggressiveClassifier")
preds_pass = clf_pass.predict(train_long_X)
precision_recall_f1score(preds_pass, train_long_df['Assessment'])

Train_long_X PassiveAggressiveClassifier
Recall: 695/695 = 100.0 %
Precision: 695/696 = 99.85632183908046 %
F1 score: 0.9992810927390366
Accuracy: 21661/21662 = 99.99538362108763 %


(1.0, 0.9985632183908046, 0.9992810927390366, 0.9999538362108762)

In [36]:
# Metrics on the dev long-text rows, which were not used for fitting.
print("Dev_long_X PassiveAggressiveClassifier")
preds_pass_dev_long =clf_pass.predict(dev_long_X)
precision_recall_f1score(preds_pass_dev_long, dev_long_df['Assessment'])

Dev_long_X PassiveAggressiveClassifier
Recall: 48/150 = 32.0 %
Precision: 48/78 = 61.53846153846154 %
F1 score: 0.4210526315789474
Accuracy: 4718/4850 = 97.27835051546391 %


(0.32, 0.6153846153846154, 0.4210526315789474, 0.9727835051546392)

### SVM

In [37]:
# Banner marking the start of the linear SVM run in the output.
print("SVM Linear SVC")

SVM Linear SVC


In [38]:
# Linear SVM (C=2) on the long-text TF-IDF features. This is the model used
# for the test-set predictions at the end of the notebook.
clf_svm = svm.LinearSVC(C=2)
clf_svm.fit(train_long_X, train_long_Y)

In [39]:
# Metrics on the rows the model was fitted on (resubstitution).
print("Train_long_X SVM")
preds_svm = clf_svm.predict(train_long_X)
precision_recall_f1score(preds_svm, train_long_df['Assessment'])

Train_long_X SVM
Recall: 693/695 = 99.71223021582733 %
Precision: 693/693 = 100.0 %
F1 score: 0.9985590778097982
Accuracy: 21660/21662 = 99.99076724217524 %


(0.9971223021582734, 1.0, 0.9985590778097982, 0.9999076724217524)

In [40]:
# Metrics on the dev long-text rows, which were not used for fitting.
# Note: unlike the other dev-prediction variables, `preds_dev_long` does not
# carry the model name.
print("Dev_long_X SVM")
preds_dev_long =clf_svm.predict(dev_long_X)
precision_recall_f1score(preds_dev_long, dev_long_df['Assessment'])

Dev_long_X SVM
Recall: 44/150 = 29.333333333333332 %
Precision: 44/66 = 66.66666666666666 %
F1 score: 0.4074074074074074
Accuracy: 4722/4850 = 97.36082474226804 %


(0.29333333333333333,
 0.6666666666666666,
 0.4074074074074074,
 0.9736082474226804)

In [41]:
# Transform the cleaned test text. This relies on `vectorize` still being the
# vectorizer fitted on train_clean_long_df, not the earlier title-only one.
test_long_X= vectorize.transform(test_clean_long_df)

In [42]:
# Test-set predictions from the linear SVM. `pred_svm` (test) is a different
# variable from `preds_svm` (training-set predictions).
pred_svm = clf_svm.predict(test_long_X)

In [43]:
# Write the submission file: one "<Docid>\t<predicted label>" line per test
# row, in test_df order.
with open('dcho13.txt', 'w') as out_file:
    out_file.writelines(
        '%s\t%i\n' % (docid, label)
        for docid, label in zip(test_df['Docid'], pred_svm)
    )