In [26]:
@InProceedings{joulin2017bag,
  title={Bag of Tricks for Efficient Text Classification},
  author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas},
  booktitle={Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers},
  month={April},
  year={2017},
  publisher={Association for Computational Linguistics},
  pages={427--431},
}

In [14]:
import fasttext
import pandas as pd

# Directory paths used throughout the notebook. Each one ends with "/" because
# file names are appended by plain string concatenation.
# DATA_DIR: training/validation .txt files and the validation .json files.
DATA_DIR = "../traindata/"
# MODELS_DIR: prefix passed to fastText as the output location of each model.
MODELS_DIR = "../models/"
# RESULT_DIR: where the prediction .json files are written and read back from.
RESULT_DIR = "../data/"

In [30]:
# Supervised fastText text classifiers (fasttext.supervised trains a
# classifier from labelled lines; it is not the unsupervised skipgram trainer).
# One model is trained per data split: "before", "after" and the combined set.
# The second argument is the output path handed to fastText for each model.
# Hyperparameters are passed as numbers consistently rather than as a mix of
# strings and ints; all_model uses a higher learning rate, more epochs and a
# smaller embedding dimension than the two split models.
before_model = fasttext.supervised(DATA_DIR + 'before_train.txt', MODELS_DIR + 'before_model', epoch=50)
after_model = fasttext.supervised(DATA_DIR + 'after_train.txt', MODELS_DIR + 'after_model', epoch=50)
all_model = fasttext.supervised(DATA_DIR + 'train.txt', MODELS_DIR + 'all_model', lr=0.5, epoch=500, dim=50)

In [31]:
# Evaluate the combined model on the full validation file, then report its
# precision followed by the number of evaluated examples.
valid_path = DATA_DIR + 'valid.txt'
result = all_model.test(valid_path)
for metric_value in (result.precision, result.nexamples):
    print(metric_value)

0.25228426395939085
1970


In [32]:
# Evaluate each split model on its own validation file.
before_result = before_model.test(DATA_DIR + 'before_valid.txt')
after_result = after_model.test(DATA_DIR + 'after_valid.txt')

# Print both precisions first, then both example counts
# ("before" ahead of "after" within each pair).
for metric_name in ('precision', 'nexamples'):
    for split_result in (before_result, after_result):
        print(getattr(split_result, metric_name))


0.26591760299625467
0.27091136079900124
801
801


In [36]:
def _load_validation_frame(filename):
    """Read one index-oriented JSON file from DATA_DIR into a DataFrame."""
    with open(DATA_DIR + filename, "r") as handle:
        return pd.read_json(handle, orient='index')


# Validation data for the combined set and for the "before"/"after" splits.
valid_df = _load_validation_frame('valid.json')
before_valid_df = _load_validation_frame('before_valid.json')
after_valid_df = _load_validation_frame('after_valid.json')

In [38]:
# Preview the first rows of the "before" validation frame.
before_valid_df.head()

Unnamed: 0,text,timestamp,tweet,value change
4109,nato commander agrees members should pay up vi...,2016-07-30 22:24:40,NATO commander agrees members should pay up vi...,MEDNEG
1270,wow even lowly rand paul has just past https i...,2015-12-24 22:20:24,"Wow, even lowly Rand Paul has just past @JebBu...",SMANEG
1830,thank you nashua new hampshire makeamericagrea...,2016-01-29 18:27:38,"Thank you- Nashua, New Hampshire! \n#MakeAmeri...",SMANEG
184,makeamericagreatagain https,2015-10-31 19:24:09,#MakeAmericaGreatAgain https://t.co/UWyANGV8IR,BIGPOS
746,https https https johnkasich all you career po...,2015-11-28 23:10:54,"""@nobaddog: @RepBJNikkel @CindyBlackwel12 John...",MEDNEG


In [41]:
def results(df, model, resultfile):
    """Predict a label for every text in ``df`` and save the frame as JSON.

    Args:
        df: DataFrame with a 'text' column. It is modified in place: a
            'predicted changes' column holding the predictions is added
            (or overwritten if it already exists).
        model: object whose ``predict(list_of_texts)`` returns one sequence
            of labels per text; only the first label of each is kept.
        resultfile: file name, appended to RESULT_DIR, of the JSON output.

    Returns:
        None. The frame is serialised with ``orient='index'`` and written to
        ``RESULT_DIR + resultfile``.
    """
    predictions = model.predict(list(df['text']))
    df['predicted changes'] = [candidates[0] for candidates in predictions]

    with open(RESULT_DIR + resultfile, 'w') as out:
        out.write(df.to_json(orient='index'))

In [43]:
# Write the predictions of each model for its matching validation frame:
# combined set first, then the "before" and "after" splits.
for eval_frame, eval_model, out_name in (
    (valid_df, all_model, 'result.json'),
    (before_valid_df, before_model, 'before_result.json'),
    (after_valid_df, after_model, 'after_result.json'),
):
    results(eval_frame, eval_model, out_name)

In [46]:
# Read the combined-model predictions back from disk and summarise them.
result_path = RESULT_DIR + 'result.json'
with open(result_path, 'r') as handle:
    testi = pd.read_json(handle, orient='index')
testi.describe()

Unnamed: 0,predicted changes,text,timestamp,tweet,value change
count,1970,1970,1970,1970,1970
unique,6,1945,1967,1967,6
top,MEDPOS,makeamericagreatagain trump https,2017-09-30 22:26:55,MAKE AMERICA GREAT AGAIN!,MEDNEG
freq,443,4,2,3,453
first,,,2015-06-14 21:15:20,,
last,,,2018-09-25 10:53:41,,


In [9]:
# Validation texts as a plain Python list; show the first ten.
texts = list(valid_df['text'])
print(texts[:10])

['https https https we all voted for you tonight as winner of the debate', 'president donald j trump proclaims october as columbusday https', 'https https high energy', 'record high for s p', 'just won lawsuit filed by the dnc and a bunch of democrat crazies trying to claim the trump campaign and others https', 'fake https made up a story that i wanted a tenfold increase in our u s nuclear arsenal pure fiction made up to demean nbc cnn', 'https https it defies belief the web of lies hillary is spinning one excuse after another then it s this then it s that', 'watch https on now https', 'https https ia caucus hasn t picked nominee in years cruz dirty tricks stole it trump way ahead in primary states', 'https vito thank you mr trump for standing up for our country votetrump join me on the trumptrain https']


In [16]:
# Keep only the first predicted label for each text; show the first ten.
labels = [candidates[0] for candidates in all_model.predict(texts)]
print(labels[:10])

['BIGPOS', 'MEDPOS', 'BIGPOS', 'BIGPOS', 'SMAPOS', 'MEDNEG', 'MEDPOS', 'MEDNEG', 'MEDPOS', 'MEDNEG']


In [36]:
# Inspect the prediction for a single tweet. The model is called with a list
# of texts, so the tweet is wrapped in a one-element list (same as texts[0:1]).
# Uses all_model, the model trained in this notebook; the previous reference
# to all_model1 is not defined anywhere here and fails on a fresh kernel.
t = [texts[0]]
print(t)

# Predict the 6 most probable classes and show their probabilities.
l = all_model.predict_proba(t, 6)
print(l)
# Most probable class only.
l = all_model.predict(t)
print(l)

# Print the labels available in the model.
print(all_model.labels)

['https https https we all voted for you tonight as winner of the debate']
[[('BIGPOS', 0.566406), ('SMANEG', 0.345703), ('BIGNEG', 0.0839844), ('MEDPOS', 1.95313e-08), ('SMAPOS', 1.95313e-08), ('MEDNEG', 1.95313e-08)]]
[['BIGPOS']]
['MEDNEG', 'MEDPOS', 'BIGPOS', 'SMANEG', 'BIGNEG', 'SMAPOS']
