In [None]:
import pandas as pd
import seaborn as sns
import random
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
import bz2
import json
from collections import defaultdict, Counter
from tqdm.notebook import tqdm
from sklearn.pipeline import Pipeline, make_pipeline
# from lime.lime_text import LimeTextExplainer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score, f1_score
import time
from sklearn.dummy import DummyClassifier
from simpletransformers.classification import ClassificationModel
%matplotlib inline

In [None]:
!pip install simpletransformers

Collecting simpletransformers
[?25l  Downloading https://files.pythonhosted.org/packages/c8/1e/13448b9e3c07e94dd035d9a791b66e08a4c7760423c120c11ea863e41c11/simpletransformers-0.51.3-py3-none-any.whl (199kB)
[K     |████████████████████████████████| 204kB 9.7MB/s 
[?25hCollecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/ca/5e/9df94df3bfee51b92b54a5e6fa277d6e1fcdf1f27b1872214b98f55ec0f7/wandb-0.10.12-py2.py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 14.8MB/s 
[?25hCollecting tqdm>=4.47.0
[?25l  Downloading https://files.pythonhosted.org/packages/8a/54/115f0c28a61d56674c3a5e05c46d6c3523ad196e1dcd3e2d8b119026df36/tqdm-4.54.1-py2.py3-none-any.whl (69kB)
[K     |████████████████████████████████| 71kB 9.8MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K  

# Option 1 (fast way): Download the data files from Canvas

In [None]:
# Read the train and test splits from the CSV files in the working directory.
train_df, test_df = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('health_train.csv', 'health_test.csv')
)

In [None]:
# Preview the first five rows of the test split.
test_df.head(n=5)

Unnamed: 0,text,label
0,Frances Ellery provided significant editorial ...,1
1,"Economic, social, educational, and health impa...",0
2,Guidance Note 10 Sanitation Marketing and CAT...,0
3,- JMP 2012 What is necessary for menstrual hy...,0
4,"Baseline in 48 LGAs of 20 States indicates 2,...",0


In [None]:
# `Series.astype` returns a new Series instead of converting in place, so the
# casts must be written back to the frame for them to have any effect.
train_df = train_df.assign(
    text=train_df.text.astype(str),
    label=train_df.label.astype(int),
)
# Echo the converted label column as the cell output.
train_df.label

0      1
1      1
2      1
3      1
4      1
      ..
169    0
170    1
171    1
172    0
173    0
Name: label, Length: 174, dtype: int64

In [None]:
# `Series.astype` returns a new Series instead of converting in place, so the
# casts must be written back to the frame for them to have any effect.
test_df = test_df.assign(
    text=test_df.text.astype(str),
    label=test_df.label.astype(int),
)
# Echo the converted label column as the cell output.
test_df.label


0     1
1     0
2     0
3     0
4     0
5     1
6     1
7     1
8     0
9     1
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    1
20    0
21    1
22    1
23    1
24    1
25    1
26    1
27    1
28    1
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
40    0
41    0
42    0
Name: label, dtype: int64

In [None]:
# Learn the TF-IDF vocabulary from the training texts and encode them in one pass.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df["text"])

# Logistic-regression baseline with default settings. The `fit` call stays the
# last expression so the fitted estimator is echoed as the cell output.
clf = LogisticRegression()
clf.fit(X_train, train_df["label"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Encode the held-out texts with the vectorizer fitted on the training split,
# then score the logistic-regression predictions against the test labels.
y_test = test_df["label"]
X_test = vectorizer.transform(test_df["text"])
y_pred = clf.predict(X_test)

f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f1)

0.0


In [None]:
# Chance-level baseline: predictions are drawn at random following the class
# frequencies of the training labels.
# The strategy is spelled out because DummyClassifier's default strategy differs
# between scikit-learn releases, and the fixed seed makes the printed score
# repeatable across runs.
random_clf = DummyClassifier(strategy='stratified', random_state=42)
random_clf.fit(X_train, train_df.label)
y_test = test_df.label

random_f1 = f1_score(y_test, random_clf.predict(X_test))
print('Random classifier: %f F1' % (random_f1))

Random classifier: 0.275862 F1




In [None]:
# simpletransformers looks for columns named `text` and `labels`; with any other
# headers it warns ("Dataframe headers not specified") and falls back to reading
# column 0 / column 1 by position. Pass frames with exactly those column names.
roberta_train_df = train_df[['text', 'label']].rename(columns={'label': 'labels'})
roberta_eval_df = test_df[['text', 'label']].rename(columns={'label': 'labels'})

roberta_args = {
    'num_train_epochs': 5,
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'learning_rate': 5e-5,
    # This option is an on/off switch, not a step count; the previous value
    # int(len(test_df)) only worked because it was truthy.
    'evaluate_during_training': True,
    # Fix the seed so repeated fine-tuning runs are comparable.
    'manual_seed': 42,
}

model = ClassificationModel(
    'roberta',
    'roberta-base',
    num_labels=2,
    use_cuda=False,
    args=roberta_args,
)
# Left as the last expression so the (global_step, training_details) result is
# shown as the cell output.
model.train_model(roberta_train_df, eval_df=roberta_eval_df)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

HBox(children=(FloatProgress(value=0.0, max=174.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=22.0, style=ProgressStyle(desc…




  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=22.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=22.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=22.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=22.0, style=ProgressStyle(desc…





(110,
 {'eval_loss': [0.6344525714715322,
   0.7698016427457333,
   0.6922136582434177,
   0.6519547725717226,
   0.7000335467358431],
  'fn': [14, 14, 14, 14, 14],
  'fp': [0, 0, 0, 0, 0],
  'global_step': [22, 44, 66, 88, 110],
  'mcc': [0.0, 0.0, 0.0, 0.0, 0.0],
  'tn': [29, 29, 29, 29, 29],
  'tp': [0, 0, 0, 0, 0],
  'train_loss': [0.8463537096977234,
   0.7320904731750488,
   0.44934725761413574,
   0.16363149881362915,
   0.3923884332180023]})

In [None]:
# Evaluate the fine-tuned model on the held-out split. The frame is handed over
# with the `text` / `labels` column names simpletransformers looks up, which
# avoids its "Dataframe headers not specified" positional fallback.
result, model_outputs, wrong_predictions = model.eval_model(
    test_df[['text', 'label']].rename(columns={'label': 'labels'})
)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=6.0, style=ProgressStyle(descrip…




  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [None]:
from sklearn.metrics import f1_score, accuracy_score

def f1_binary(labels, preds):
    """Return the F1 score computed with scikit-learn's ``average='binary'`` setting.

    Args:
        labels: Ground-truth labels, forwarded to ``f1_score`` as ``y_true``.
        preds: Predicted labels, forwarded to ``f1_score`` as ``y_pred``.

    Returns:
        The value returned by ``f1_score`` for the given labels and predictions.
    """
    score = f1_score(y_true=labels, y_pred=preds, average='binary')
    return score
    
# Re-run the evaluation with two extra metrics: each keyword argument is a
# metric name mapped to a function called as metric(labels, preds).
# The frame is handed over with the `text` / `labels` column names
# simpletransformers looks up, which avoids its "Dataframe headers not
# specified" positional fallback.
result, model_outputs, wrong_predictions = model.eval_model(
    test_df[['text', 'label']].rename(columns={'label': 'labels'}),
    f1=f1_binary,
    acc=accuracy_score,
)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=6.0, style=ProgressStyle(descrip…




  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [None]:
# Display the metrics dict returned by the most recent `model.eval_model(...)` call.
# NOTE(review): the saved output under this cell reports f1 of about 0.543 together
# with tp = 0. A binary F1 is 0 when tp = 0; 0.543 matches a weighted-average F1 for
# tn = 29 / fn = 14, so this output is presumably left over from an earlier metric
# definition. Re-run on a fresh kernel to refresh it.
result

{'acc': 0.6744186046511628,
 'eval_loss': 0.6314673349261284,
 'f1': 0.5432816537467701,
 'fn': 14,
 'fp': 0,
 'mcc': 0.0,
 'tn': 29,
 'tp': 0}

In [None]:
# Display the metrics dict from the most recent `model.eval_model(...)` call.
# NOTE(review): this repeats the preceding display cell; presumably it was re-run
# after the metric function changed. Consider keeping only one of the two.
result

{'acc': 0.6744186046511628,
 'eval_loss': 0.6314673349261284,
 'f1': 0.0,
 'fn': 14,
 'fp': 0,
 'mcc': 0.0,
 'tn': 29,
 'tp': 0}