In [12]:
import os
import pandas as pd
import json

In [13]:
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score

In [14]:
def get_src_text(row):
    path = "data/en/" + row['event'] + "/" + str(row['threadid']) + "/source-tweets/" + str(row['tweetid']) + ".json"
    with open(path, "r") as f:
        source = json.loads(f.read())
        return source['text']

def is_true(row):
    print(row)
    path = "data/en/" + row['event'] + "/" + str(row['threadid']) + "/annotation.json"
    with open(path, "r") as f:
        source = json.loads(f.read())
        return str(source.get('true', 'unverified'))

In [15]:
# Dataset used: https://figshare.com/articles/dataset/PHEME_rumour_scheme_dataset_journalism_use_case/2068650
df = pd.read_json("data/en-scheme-annotations.json", dtype = {"threadid": str, "tweetid": str}, lines=True)

df['true'] = df.apply(lambda row: is_true(row), axis=1)
df['src_text'] = df.apply(lambda row: get_src_text(row), axis=1)

event                  putinmissing
threadid         577258317942149120
tweetid          577258317942149120
support                  supporting
evidentiality             url-given
certainty          somewhat-certain
Name: 0, dtype: object
event                  putinmissing
threadid         576755174531862529
tweetid          576755174531862529
support                  supporting
evidentiality             url-given
certainty          somewhat-certain
Name: 1, dtype: object
event                  putinmissing
threadid         576319832800555008
tweetid          576319832800555008
support                  supporting
evidentiality             url-given
certainty          somewhat-certain
Name: 2, dtype: object
event                  putinmissing
threadid         576513463738109954
tweetid          576513463738109954
support                     denying
evidentiality             url-given
certainty                   certain
Name: 3, dtype: object
event                  charliehebdo
threadid

event             germanwings-crash
threadid         581386094337474560
tweetid          581386094337474560
support                  supporting
evidentiality             url-given
certainty                   certain
Name: 126, dtype: object
event             germanwings-crash
threadid         580324027715063808
tweetid          580324027715063808
support                  supporting
evidentiality             url-given
certainty                   certain
Name: 127, dtype: object
event             germanwings-crash
threadid         580325090367315968
tweetid          580325090367315968
support                  supporting
evidentiality             url-given
certainty          somewhat-certain
Name: 128, dtype: object
event             germanwings-crash
threadid         581063377226637312
tweetid          581063377226637312
support                  supporting
evidentiality      picture-attached
certainty                   certain
Name: 129, dtype: object
event             germanwings-crash


event                ottawashooting
threadid         525056576038518785
tweetid          525056576038518785
support                  supporting
evidentiality             url-given
certainty                   certain
Name: 238, dtype: object
event                ottawashooting
threadid         524948866773184512
tweetid          524948866773184512
support                  supporting
evidentiality         source-quoted
certainty                   certain
Name: 239, dtype: object
event                ottawashooting
threadid         525028734991343617
tweetid          525028734991343617
support                  supporting
evidentiality         source-quoted
certainty                   certain
Name: 240, dtype: object
event                        ottawashooting
threadid                 525025463648137216
tweetid                  525025463648137216
support                          supporting
evidentiality    unverifiable-source-quoted
certainty                           certain
Name: 241, dt

In [16]:
train_text, temp_text, train_labels, temp_labels = train_test_split(df['src_text'], df['true'], 
                                                                    random_state=2018, 
                                                                    test_size=0.3)


val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels, 
                                                                random_state=2018, 
                                                                test_size=0.5, 
                                                                stratify=temp_labels)

In [17]:
print(val_labels.value_counts())
print(train_text)

0             22
1             18
unverified     5
Name: true, dtype: int64
108    PRINCE IN TORONTO TONIGHT:\n@3RDEYEGIRL tweete...
85     Up to 20 held hostage in Sydney Lindt Cafe sie...
170    Five hostages have escaped the besieged Lindt ...
76     #sydneysiege is over. 2 confirmed dead, #PrayF...
286    BREAKING: Three hostages appear to have escape...
                             ...                        
156    BREAKING  @SkyBusiness: freed hostage borne hi...
265    DETAILS: The hostage site is Lindt Chocolat Ca...
226    Currently the #FoxNews website has zero, repea...
102    Police convoy and helicopters are rushing to s...
250    Local media: 3 people appear to escape from Ma...
Name: src_text, Length: 207, dtype: object


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Get features for TF-IDF
tfidf = TfidfVectorizer()
tfidf.fit(train_text)
train_features = tfidf.transform(train_text)
validation_features = tfidf.transform(val_text)
test_features = tfidf.transform(test_text)

In [19]:
def evaluation_summary(description, true_labels, predictions):
  print("Evaluation for: " + description)
  print(classification_report(true_labels, predictions,  digits=3, zero_division=0))
  print('\nConfusion matrix:\n',confusion_matrix(true_labels, predictions))

In [20]:
# SVC
svc = SVC(kernel='rbf')
svc_model = svc.fit(train_features, train_labels)
print(svc_model.score(validation_features, val_labels))
svc_predicted_labels = svc_model.predict(validation_features)

evaluation_summary("SVC", val_labels, svc_predicted_labels)
svc_test = svc_model.predict(test_features)
evaluation_summary("SVC test", test_labels, svc_test)

# Logistic Regression with TF-IDF
lr_tfidf = LogisticRegression()
lr_model_tfidf = lr_tfidf.fit(train_features, train_labels)
print(lr_model_tfidf.score(validation_features, val_labels))
print(lr_model_tfidf.score(test_features, test_labels))

lr_predicted_labels_tfidf = lr_model_tfidf.predict(validation_features)
evaluation_summary("LR (TF-IDF)", val_labels, lr_predicted_labels_tfidf)
lrtfidf_test = lr_model_tfidf.predict(test_features)
evaluation_summary("LR (TF-IDF) test", test_labels, lrtfidf_test)

# Dummy Majority
dumb = DummyClassifier(strategy='most_frequent')
dumb.fit(train_features, train_labels)
print(dumb.score(validation_features, val_labels))
dumb_validation_predicted_labels = dumb.predict(validation_features)
evaluation_summary("Dummy majority", val_labels, dumb_validation_predicted_labels)
dumb_test = dumb.predict(test_features)
evaluation_summary("Dummy MF test", test_labels, dumb_test)

0.6
Evaluation for: SVC
              precision    recall  f1-score   support

           0      0.800     0.545     0.649        22
           1      0.500     0.833     0.625        18
  unverified      0.000     0.000     0.000         5

    accuracy                          0.600        45
   macro avg      0.433     0.460     0.425        45
weighted avg      0.591     0.600     0.567        45


Confusion matrix:
 [[12 10  0]
 [ 3 15  0]
 [ 0  5  0]]
Evaluation for: SVC test
              precision    recall  f1-score   support

           0      0.750     0.545     0.632        22
           1      0.500     0.778     0.609        18
  unverified      1.000     0.200     0.333         5

    accuracy                          0.600        45
   macro avg      0.750     0.508     0.525        45
weighted avg      0.678     0.600     0.589        45


Confusion matrix:
 [[12 10  0]
 [ 4 14  0]
 [ 0  4  1]]
0.5777777777777777
0.6222222222222222
Evaluation for: LR (TF-IDF)
         

In [21]:
def get_src_text_by_index(row):
    return df.iloc[row.index]['src_text']

accuracy = pd.DataFrame([test_labels, lrtfidf_test])
print(accuracy)
print(lrtfidf_test)
print(test_labels)
print(type(test_labels))
test_df = pd.DataFrame(test_labels)
test_df['predicted'] = lrtfidf_test
print(test_df.apply(lambda row: get_src_text_by_index(row)))
print(test_df)
#test_df = pd.DataFrame({"tweet_id":test_labels[0], "actual_label": test_labels[1]})
test_labels.add(lrtfidf_test)
print(test_labels)

           127 18   143  161 32   131  177         19          29   172  ...  \
true         1   1    1    1   0    0    0           0  unverified    0  ...   
Unnamed 0  NaN   0  NaN  NaN   0  NaN  NaN  unverified           1  NaN  ...   

           288  202  162 5    58   69  17          210  223  54   
true         1    0    1   1    1    0   0  unverified    0    0  
Unnamed 0  NaN  NaN  NaN   1  NaN  NaN   0         NaN  NaN  NaN  

[2 rows x 45 columns]
['1' '0' '0' '1' '0' '1' '1' '0' '1' '1' '1' '1' '0' '0' '0' '1' '0' '0'
 '0' 'unverified' '1' '1' '0' '0' '1' '1' '1' '1' '1' '1' '1' '1' '0' '0'
 '1' '1' '0' '1' '1' '1' '0' '1' '1' '0' '1']
127             1
18              1
143             1
161             1
32              0
131             0
177             0
19              0
29     unverified
172             0
92              1
51              1
215             0
7               1
206             0
285             1
269             1
118             0
36              0


In [22]:
# Get output

input_text = "Watch video showing gunfire inside Canada's pa"
input_df = pd.DataFrame({"src_text": [input_text]})
input_features = tfidf.transform(input_df)

predicted_label = lr_model_tfidf.predict(input_features)

print(predicted_label)

['0']
