In [1]:
# LIBRARIES IMPORT
import os
import json
import pandas as pd
import numpy as np
import seaborn as sb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# DECLARING CONSTANTS
SOURCE_PATH = './data/all-rnr-annotated-threads'

TOPICS = [
    'charliehebdo-all-rnr-threads',
    'ferguson-all-rnr-threads',
    'ebola-essien-all-rnr-threads',
    'germanwings-crash-all-rnr-threads',
    'gurlitt-all-rnr-threads',
    'ottawashooting-all-rnr-threads',
    'prince-toronto-all-rnr-threads',
    'putinmissing-all-rnr-threads',
    'sydneysiege-all-rnr-threads'
]

SELECTABLE_COLUMNS = [
    'text', 
    'favorite_count',
    'retweeted',
    'retweet_count',
    'favorited',
    'user.profile_use_background_image',
    'user.default_profile_image',
    'user.verified', 
    'user.followers_count',
    'user.listed_count',
    'user.statuses_count',
    'user.description',
    'user.friends_count',
    'user.favourites_count'
]

CLASSIFICATION_DATA = []

In [3]:
def preprocess_topic(source_path,topic):
    topic_path = "{0}/{1}/".format(source_path,topic)
    topic_data = []
    for class_name in ['rumours', 'non-rumours']:
        class_path = topic_path + class_name
        class_dirs = os.listdir(class_path)
        for class_dir in os.listdir(class_path):
            features_set = get_topic_data(class_path,class_dir)
            features_set[:0] = [topic]
            features_set.append(class_name)
            topic_data.append(features_set)
            CLASSIFICATION_DATA.append(features_set)

In [4]:
def get_topic_data(class_path,topic_id):
    data_path = "{0}/{1}/source-tweets/{2}.json".format(class_path,topic_id,topic_id)

    with open(data_path, 'r') as f:
        json_data = json.load(f)
    tweets_df = pd.json_normalize(json_data)
    
    flat_tweets = tweets_df[SELECTABLE_COLUMNS].to_dict('list')
    csv_data = sum(flat_tweets.values(),[])
    csv_data.append(len(os.listdir("{0}/{1}/reactions".format(class_path,topic_id))))
    return(csv_data)

In [5]:
#Data Generation
def create_data_set():
    for topic in TOPICS:
        preprocess_topic(SOURCE_PATH,topic)

def generate_df():
    if CLASSIFICATION_DATA:
        data_ndarray = np.array(CLASSIFICATION_DATA)
        data_columns = ['topic'] +SELECTABLE_COLUMNS+ ['reaction_count', 'category']
        return(pd.DataFrame(data_ndarray, columns=data_columns))
create_data_set()
df = generate_df()

In [6]:
print(df.describe())
print(df["category"].value_counts())
df.head()

                               topic  \
count                           6425   
unique                             9   
top     charliehebdo-all-rnr-threads   
freq                            2079   

                                                     text  favorite_count  \
count                                                6425            6425   
unique                                               6407             732   
top     In a tweet concert promoter Live Nation says t...               0   
freq                                                    2             188   

       retweeted  retweet_count favorited user.profile_use_background_image  \
count       6425           6425      6425                              6425   
unique         1           1070         1                                 2   
top        False              2     False                              True   
freq        6425            156      6425                              5619   

       user.defau

Unnamed: 0,topic,text,favorite_count,retweeted,retweet_count,favorited,user.profile_use_background_image,user.default_profile_image,user.verified,user.followers_count,user.listed_count,user.statuses_count,user.description,user.friends_count,user.favourites_count,reaction_count,category
0,charliehebdo-all-rnr-threads,Report: Hostages taken at a printing shop nort...,60,False,272,False,True,False,True,7197412,89498,89005,Get our free iOS or Android app to unlock more...,509,19,8,rumours
1,charliehebdo-all-rnr-threads,#CharlieHebdo editor-in-chief Stephane Charbon...,179,False,858,False,True,False,True,12909401,102656,22778,Breaking news alerts and updates from the BBC....,3,0,10,rumours
2,charliehebdo-all-rnr-threads,Hostages have reportedly been taken as police ...,1,False,143,False,True,False,True,1790312,14711,122439,Stories direct from the @SkyNews newsroom. Twe...,17,2,2,rumours
3,charliehebdo-all-rnr-threads,Suspects in Thursday slaying of policewoman an...,19,False,117,False,True,False,False,213434,2454,22939,A global news channel bringing you breaking ne...,2267,295,8,rumours
4,charliehebdo-all-rnr-threads,#BREAKING - An armed man has reportedly taken ...,23,False,214,False,True,False,True,1258380,6512,144267,International News 24/7 in your language: @Fra...,380,424,7,rumours


In [7]:
# PREPROCESSING DATA
df['text_length']  = df['text'].str.len()
df["contain_hashtags"] = df["text"].str.contains("#")
df['user.description_length']  = df['user.description'].str.len()
df['user.description_contain_at']  = df['user.description'].str.contains("@")

In [8]:
# Label encoding boolean columns
categorical_columns = [
    'retweeted',
    'favorited',
    'user.profile_use_background_image',
    'user.default_profile_image',
    'user.verified',
    'category',
    'contain_hashtags',
    'user.description_contain_at'
]

le = LabelEncoder()
df[categorical_columns] = df[categorical_columns].apply(le.fit_transform)

# One hot encoding different topic categories
df = pd.concat([df,pd.get_dummies(df['topic'])],axis=1)

# Drop original columns after modification
df = df.drop(['topic','text', 'user.description'], axis=1)

In [9]:
#Handling missing values by imputations
df['user.description_length'].fillna(df['user.description_length'].mean(), inplace = True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6425 entries, 0 to 6424
Data columns (total 27 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   favorite_count                     6425 non-null   object 
 1   retweeted                          6425 non-null   int64  
 2   retweet_count                      6425 non-null   object 
 3   favorited                          6425 non-null   int64  
 4   user.profile_use_background_image  6425 non-null   int64  
 5   user.default_profile_image         6425 non-null   int64  
 6   user.verified                      6425 non-null   int64  
 7   user.followers_count               6425 non-null   object 
 8   user.listed_count                  6425 non-null   object 
 9   user.statuses_count                6425 non-null   object 
 10  user.friends_count                 6425 non-null   object 
 11  user.favourites_count              6425 non-null   objec

In [None]:
df.info()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6425 entries, 0 to 6424
Data columns (total 27 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   favorite_count                     6425 non-null   object 
 1   retweeted                          6425 non-null   int64  
 2   retweet_count                      6425 non-null   object 
 3   favorited                          6425 non-null   int64  
 4   user.profile_use_background_image  6425 non-null   int64  
 5   user.default_profile_image         6425 non-null   int64  
 6   user.verified                      6425 non-null   int64  
 7   user.followers_count               6425 non-null   object 
 8   user.listed_count                  6425 non-null   object 
 9   user.statuses_count                6425 non-null   object 
 10  user.friends_count                 6425 non-null   object 
 11  user.favourites_count              6425 non-null   objec

favorite_count                       0
retweeted                            0
retweet_count                        0
favorited                            0
user.profile_use_background_image    0
user.default_profile_image           0
user.verified                        0
user.followers_count                 0
user.listed_count                    0
user.statuses_count                  0
user.friends_count                   0
user.favourites_count                0
reaction_count                       0
category                             0
text_length                          0
contain_hashtags                     0
user.description_length              0
user.description_contain_at          0
charliehebdo-all-rnr-threads         0
ebola-essien-all-rnr-threads         0
ferguson-all-rnr-threads             0
germanwings-crash-all-rnr-threads    0
gurlitt-all-rnr-threads              0
ottawashooting-all-rnr-threads       0
prince-toronto-all-rnr-threads       0
putinmissing-all-rnr-thre

In [None]:
target = df['category']
features = df.drop(['category'], axis=1)

In [30]:
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=0)

In [31]:
logisticRegr = LogisticRegression()
logisticRegr.fit(x_train, y_train)

In [32]:
predictions = logisticRegr.predict(x_test)
from sklearn.metrics import classification_report, confusion_matrix

In [33]:
y_pred = logisticRegr.predict(x_test)
score_ = logisticRegr.score(x_test, y_test)
report = classification_report(y_test, y_pred)

In [34]:
score_

0.6297448662103298

In [35]:
print(report)

              precision    recall  f1-score   support

           0       0.64      0.95      0.77      1021
           1       0.45      0.07      0.12       586

    accuracy                           0.63      1607
   macro avg       0.55      0.51      0.44      1607
weighted avg       0.57      0.63      0.53      1607

