In [26]:
@InProceedings{joulin2017bag,
  title={Bag of Tricks for Efficient Text Classification},
  author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas},
  booktitle={Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers},
  month={April},
  year={2017},
  publisher={Association for Computational Linguistics},
  pages={427--431},
}

In [4]:
import fasttext
import pandas as pd

# Relative folders used by the notebook: training inputs, saved models, exported results.
DATA_DIR, MODELS_DIR, RESULT_DIR = "../traindata/", "../models/", "../data/"

# Path of the wiki-news-300d-1M-subword.vec file inside DATA_DIR
# (not referenced by the cells visible in this notebook section).
pretrained = "".join((DATA_DIR, "wiki-news-300d-1M-subword.vec"))

In [5]:
# Supervised fastText classifiers (fasttext.supervised), not skipgram embeddings.
# Each call trains on one file from DATA_DIR; the second argument is the output
# path under MODELS_DIR. Numeric hyperparameters are given as numbers rather
# than strings so that all three calls follow the same convention.
before_model = fasttext.supervised(DATA_DIR + 'before_train.txt', MODELS_DIR + 'before_model', epoch=50)
after_model = fasttext.supervised(DATA_DIR + 'after_train.txt', MODELS_DIR + 'after_model', epoch=50)
all_model1 = fasttext.supervised(DATA_DIR + 'train.txt', MODELS_DIR + 'all_model', lr=0.5, epoch=100)
# Alternative configuration for the combined model, kept for reference:
#all_model1 = fasttext.supervised(DATA_DIR + 'train.txt', MODELS_DIR + 'all_model', dim=50, epoch=500, lr=1.0, loss='hs')

In [6]:
# Score the combined model on valid.txt and show precision, then recall,
# each on its own line.
result = all_model1.test(DATA_DIR + 'valid.txt')
print(result.precision, result.recall, sep="\n")

0.45229982964224874
0.45229982964224874


In [38]:
# Evaluate the two split models, each on its own validation file.
before_result = before_model.test(DATA_DIR + 'before_valid.txt')
after_result = after_model.test(DATA_DIR + 'after_valid.txt')

# Print precision for both models first, then the number of examples for both.
for metric_name in ('precision', 'nexamples'):
    for split_result in (before_result, after_result):
        print(getattr(split_result, metric_name))


0.2676147382029735
0.25561097256857856
1547
802


In [7]:
# Load the validation records (JSON keyed by row index) into a DataFrame.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's default locale encoding.
with open(DATA_DIR + 'valid.json', "r", encoding="utf-8") as file:
    valid_df = pd.read_json(file, orient='index')

In [8]:
# Preview the first five rows of the validation frame.
valid_df.iloc[:5]

Unnamed: 0,text,timestamp,tweet,value change
2541,https https https we all voted for you tonight...,2015-10-29 03:48:02,"""@hyatt1942: @realDonaldTrump @piersmorgan WE ...",MEDNEG
12527,president donald j trump proclaims october as ...,2017-10-09 18:35:04,"""President Donald J. Trump Proclaims October 9...",MEDPOS
5831,https https high energy,2016-05-25 05:45:19,"""@buiIdthewall: @realDonaldTrump high energy!""",SMANEG
9855,record high for s p,2017-09-29 13:39:53,RECORD HIGH FOR S &amp; P 500!,BIGPOS
14647,just won lawsuit filed by the dnc and a bunch ...,2018-07-06 16:57:34,Just won lawsuit filed by the DNC and a bunch ...,BIGPOS


In [9]:
# Pull the 'text' column out as a plain Python list and peek at the first ten entries.
texts = list(valid_df['text'])
print(texts[:10])

['https https https we all voted for you tonight as winner of the debate', 'president donald j trump proclaims october as columbusday https', 'https https high energy', 'record high for s p', 'just won lawsuit filed by the dnc and a bunch of democrat crazies trying to claim the trump campaign and others https', 'fake https made up a story that i wanted a tenfold increase in our u s nuclear arsenal pure fiction made up to demean nbc cnn', 'https https it defies belief the web of lies hillary is spinning one excuse after another then it s this then it s that', 'watch https on now https', 'https https ia caucus hasn t picked nominee in years cruz dirty tricks stole it trump way ahead in primary states', 'https vito thank you mr trump for standing up for our country votetrump join me on the trumptrain https']


In [16]:
# predict() gives one sequence of labels per input text; keep only the first
# entry of each sequence as that text's predicted label.
top_label_lists = all_model1.predict(texts)
labels = [top_labels[0] for top_labels in top_label_lists]
print(labels[:10])

['BIGPOS', 'MEDPOS', 'BIGPOS', 'BIGPOS', 'SMAPOS', 'MEDNEG', 'MEDPOS', 'MEDNEG', 'MEDPOS', 'MEDNEG']


In [35]:
# predict_proba() is given a list of texts, so wrap the first tweet in a list.
t = [texts[0]]
print(t)

# Ask for the 6 most probable classes together with their probabilities.
l = all_model1.predict_proba(t, 6)
print(l)

# Show the labels exposed by the model.
print(all_model1.labels)

['https https https we all voted for you tonight as winner of the debate']
[[('BIGPOS', 0.566406), ('SMANEG', 0.345703), ('BIGNEG', 0.0839844), ('MEDPOS', 1.95313e-08), ('SMAPOS', 1.95313e-08), ('MEDNEG', 1.95313e-08)]]
['MEDNEG', 'MEDPOS', 'BIGPOS', 'SMANEG', 'BIGNEG', 'SMAPOS']


In [25]:
# Show the first ten validation rows for comparison with the predictions above.
valid_df.head(10)

Unnamed: 0,text,timestamp,tweet,value change
2541,https https https we all voted for you tonight...,2015-10-29 03:48:02,"""@hyatt1942: @realDonaldTrump @piersmorgan WE ...",MEDNEG
12527,president donald j trump proclaims october as ...,2017-10-09 18:35:04,"""President Donald J. Trump Proclaims October 9...",MEDPOS
5831,https https high energy,2016-05-25 05:45:19,"""@buiIdthewall: @realDonaldTrump high energy!""",SMANEG
9855,record high for s p,2017-09-29 13:39:53,RECORD HIGH FOR S &amp; P 500!,BIGPOS
14647,just won lawsuit filed by the dnc and a bunch ...,2018-07-06 16:57:34,Just won lawsuit filed by the DNC and a bunch ...,BIGPOS
10029,fake https made up a story that i wanted a ten...,2017-10-11 13:45:22,Fake @NBCNews made up a story that I wanted a ...,MEDNEG
6931,https https it defies belief the web of lies h...,2016-09-05 01:24:26,"""@CherNuna: @realDonaldTrump It defies belief ...",MEDPOS
4365,watch https on now https,2016-02-05 01:03:23,Watch @AC360 on NOW! @CNN,BIGNEG
4394,https https ia caucus hasn t picked nominee in...,2016-02-06 20:04:09,"""@realOllieTaylor: @paintonmy: IA caucus hasn'...",BIGNEG
5938,https vito thank you mr trump for standing up ...,2016-06-04 12:49:18,"""@Don_Vito_08: Thank You Mr. Trump for Standin...",MEDPOS


In [26]:
# Store the predicted labels as a 'predicted changes' column alongside the
# existing 'value change' column.
valid_df.loc[:, 'predicted changes'] = labels

In [27]:
# Preview the first five rows, now including the 'predicted changes' column.
valid_df.iloc[:5]

Unnamed: 0,text,timestamp,tweet,value change,predicted changes
2541,https https https we all voted for you tonight...,2015-10-29 03:48:02,"""@hyatt1942: @realDonaldTrump @piersmorgan WE ...",MEDNEG,BIGPOS
12527,president donald j trump proclaims october as ...,2017-10-09 18:35:04,"""President Donald J. Trump Proclaims October 9...",MEDPOS,MEDPOS
5831,https https high energy,2016-05-25 05:45:19,"""@buiIdthewall: @realDonaldTrump high energy!""",SMANEG,BIGPOS
9855,record high for s p,2017-09-29 13:39:53,RECORD HIGH FOR S &amp; P 500!,BIGPOS,BIGPOS
14647,just won lawsuit filed by the dnc and a bunch ...,2018-07-06 16:57:34,Just won lawsuit filed by the DNC and a bunch ...,BIGPOS,SMAPOS


In [28]:
# Export the frame as index-oriented JSON, letting pandas write straight
# to the open file handle.
with open(RESULT_DIR + 'results.json', 'w') as out_file:
    valid_df.to_json(out_file, orient='index')

In [29]:
# Round-trip check: confirm the exported JSON can be loaded back into a DataFrame.
with open(RESULT_DIR + 'results.json', 'r') as results_file:
    testi = pd.read_json(results_file, orient='index')

In [30]:
# Preview the first five rows of the reloaded results.
testi.iloc[:5]

Unnamed: 0,predicted changes,text,timestamp,tweet,value change
2541,BIGPOS,https https https we all voted for you tonight...,2015-10-29 03:48:02,"""@hyatt1942: @realDonaldTrump @piersmorgan WE ...",MEDNEG
12527,MEDPOS,president donald j trump proclaims october as ...,2017-10-09 18:35:04,"""President Donald J. Trump Proclaims October 9...",MEDPOS
5831,BIGPOS,https https high energy,2016-05-25 05:45:19,"""@buiIdthewall: @realDonaldTrump high energy!""",SMANEG
9855,BIGPOS,record high for s p,2017-09-29 13:39:53,RECORD HIGH FOR S &amp; P 500!,BIGPOS
14647,SMAPOS,just won lawsuit filed by the dnc and a bunch ...,2018-07-06 16:57:34,Just won lawsuit filed by the DNC and a bunch ...,BIGPOS
