In [18]:
#Numpy deals with large arrays and linear algebra
import numpy as np
# Library for data manipulation and analysis
import pandas as pd 
 
# Metrics for Evaluation of model Accuracy and F1-score
from sklearn.metrics  import f1_score, accuracy_score, precision_score
 
#Importing the Decision Tree from scikit-learn library
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
# Support Vector Machine
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

# For splitting of data into train and test set
from sklearn.model_selection import train_test_split

import os
import re
import json
import pickle


In [2]:

def create_dataframe(account_data_list, dataset_type):
    dataframe = pd.DataFrame({})
    
    if dataset_type == "automated":
        for account_data in account_data_list:
            user_follower_count = account_data["userFollowerCount"]
            user_following_count = account_data["userFollowingCount"]
            follower_following_ratio = user_follower_count/max(1,user_following_count)
            
            temp_dataframe = pd.Series({"user_media_count":account_data["userMediaCount"],
                                        "user_follower_count":account_data["userFollowerCount"],
                                        "user_following_count":account_data["userFollowingCount"],
                                        "user_has_highligh_reels":account_data["userHasHighlighReels"],
                                        "user_has_external_url":account_data["userHasExternalUrl"],
                                        "user_tags_count":account_data["userTagsCount"],
                                        "follower_following_ratio":follower_following_ratio,
                                        "user_biography_length":account_data["userBiographyLength"],
                                        "username_length":account_data["usernameLength"],
                                        "username_digit_count":account_data["usernameDigitCount"],
                                        "media_comment_numbers":account_data["mediaCommentNumbers"],
                                        "media_comments_are_disabled":account_data["mediaCommentNumbers"],
                                        "media_has_location_info":account_data["mediaHasLocationInfo"],
                                        "media_hashtag_numbers":account_data["mediaHashtagNumbers"],
                                        "media_like_numbers":account_data["mediaLikeNumbers"],
                                        "mediaUpload_times":account_data["mediaUploadTimes"],
                                        "automated_behaviour":account_data["automatedBehaviour"]
                                        })
            dataframe = dataframe.append(temp_dataframe, ignore_index=True)
            
    elif dataset_type == "fake":
        for account_data in account_data_list:
            user_follower_count = account_data["userFollowerCount"]
            user_following_count = account_data["userFollowingCount"]
            follower_following_ratio = user_follower_count/max(1,user_following_count)
            
            temp_dataframe = pd.Series({"user_media_count":account_data["userMediaCount"],
                                      "user_follower_count":account_data["userFollowerCount"],
                                      "user_following_count":account_data["userFollowingCount"],
                                      "user_has_profil_pic":account_data["userHasProfilPic"],
                                      "user_is_private":account_data["userIsPrivate"],
                                      "follower_following_ratio":follower_following_ratio,
                                      "user_biography_length":account_data["userBiographyLength"],
                                      "username_length":account_data["usernameLength"],
                                      "username_digit_count":account_data["usernameDigitCount"],
                                      "is_fake":account_data["isFake"]
                                        })
            dataframe = dataframe.append(temp_dataframe, ignore_index=True)
    return dataframe

#%% Import automated/nonautomated data
    
def import_data(dataset_path, dataset_version):
    #base_path = os.path.dirname(os.path.abspath(__file__))
    #base_path = "/Users/fca/Documents/GitHub/instafake-dataset"
    dataset_type = re.findall("automated|fake",dataset_version)[0]
    if dataset_type == "automated":
        with open(dataset_path + "/" + dataset_version + "/automatedAccountData.json") as json_file:
            automated_account_data = json.load(json_file)
        with open(dataset_path + "/" + dataset_version + "/nonautomatedAccountData.json") as json_file:
            nonautomated_account_data = json.load(json_file)
            
        automated_account_dataframe = create_dataframe(automated_account_data, dataset_type)
        nonautomated_account_dataframe = create_dataframe(nonautomated_account_data, dataset_type)
        merged_dataframe = automated_account_dataframe.append(nonautomated_account_dataframe, ignore_index=True)
        data = dict({"dataset_type":dataset_type,
                     "dataframe":merged_dataframe})
    
    elif dataset_type == "fake":
        with open(dataset_path + "/" + dataset_version + "/fakeAccountData.json") as json_file:
            fake_account_data = json.load(json_file)
        with open(dataset_path + "/" + dataset_version + "/realAccountData.json") as json_file:
            real_account_data = json.load(json_file)
            
        fake_account_dataframe = create_dataframe(fake_account_data, dataset_type)
        real_account_dataframe = create_dataframe(real_account_data, dataset_type)
        merged_dataframe = fake_account_dataframe.append(real_account_dataframe, ignore_index=True)
        data = dict({"dataset_type":dataset_type,
                     "dataframe":merged_dataframe})
            
    return data

In [4]:
fake_df = pd.read_json("../data/fake/fake-data.json")
real_df = pd.read_json("../data/fake/real-data.json")
df = pd.concat([fake_df,real_df])

In [5]:
y = df.isFake
X = df.drop(["isFake"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2)

In [6]:
X_test.describe()

Unnamed: 0,userFollowerCount,userFollowingCount,userBiographyLength,userMediaCount,userHasProfilPic,userIsPrivate,usernameDigitCount,usernameLength
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,369.133779,647.672241,20.846154,57.702341,0.926421,0.682274,0.448161,11.070234
std,344.570293,828.622573,31.591931,110.422078,0.261522,0.466373,1.117203,2.837622
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
25%,154.5,256.5,0.0,3.0,1.0,0.0,0.0,9.0
50%,319.0,454.0,3.0,20.0,1.0,1.0,0.0,11.0
75%,471.5,690.5,32.0,61.5,1.0,1.0,0.0,13.0
max,3140.0,7493.0,150.0,875.0,1.0,1.0,7.0,21.0


In [9]:
# Training the model is as simple as this
# Use the function imported above and apply fit() on it
DT = DecisionTreeClassifier()
DT.fit(X_train,y_train)
# We use the predict() on the model to predict the output
pred=DT.predict(X_test)
 
# for classification we use accuracy and F1 score
print(accuracy_score(y_test,pred))
print(f1_score(y_test,pred))
 

0.9565217391304348
0.8505747126436781


In [11]:
# for regression we use R2 score and MAE(mean absolute error)
# all other steps will be same as classification as shown above
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
print(mean_absolute_error(y_test,pred))
print(mean_absolute_error(y_test,pred))

0.043478260869565216
0.043478260869565216


In [34]:
def evaluate_using_model(class_, X_train,X_test,y_train,y_test):
    c = class_()
    c.fit(X_train,y_train)
    pred = c.predict(X_test)
    # for classification we use accuracy and F1 score
    filename = f'./trained_models/{class_.__name__}_finalized_model.sav'
    pickle.dump(c, open(filename, 'wb'))
    
    return accuracy_score(y_test,pred), f1_score(y_test,pred), precision_score(y_test,pred)

In [44]:
goodness = {}
classifiers = [
    RandomForestClassifier,
    DecisionTreeClassifier,
    #     LogisticRegression,
    SVC,
    GaussianNB,
    MultinomialNB,
    SGDClassifier,
    GradientBoostingClassifier,
]
for classifier in classifiers:
    accuracy, f1, precision = evaluate_using_model(
        classifier, X_train, X_test, y_train, y_test
    )
    goodness[classifier.__name__] = {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
    }

In [36]:
df = pd.DataFrame(goodness).T
df.sort_values("accuracy", ascending=False)

Unnamed: 0,accuracy,f1,precision
RandomForestClassifier,0.963211,0.873563,0.95
GradientBoostingClassifier,0.956522,0.847059,0.947368
DecisionTreeClassifier,0.949833,0.827586,0.9
MultinomialNB,0.946488,0.809524,0.918919
LogisticRegression,0.939799,0.775,0.939394
SVC,0.923077,0.693333,0.928571
GaussianNB,0.909699,0.703297,0.727273
SGDClassifier,0.842809,0.0,0.0
