In [10]:
import pathlib

import pandas as pd

from bothunting import definitions

# Auxiliary functions

In [14]:
def show_head(df: pd.DataFrame):
    print(df.head())
    
def show_tail(df: pd.DataFrame):
    print(df.tail())
    
def get_header(df: pd.DataFrame):
    return list(df.columns)
    
def show_header(df: pd.DataFrame):
    print(get_header(df))
    
def print_sep():
    print(40*"-")

# Import feature data

In [7]:
prj_root = definitions.get_prj_root()
print(prj_root)

C:\Users\salzi\Documents\work\techlabs\bot_hunting_techlabs


In [8]:
path_features = prj_root / "bothunting" / "datasets" / "expanded_dataset"  / "complete_data.csv"
df = pd.read_csv(path_features)

In [20]:
show_head(df)
print_sep()
show_tail(df)

print(len(df))

           id               name      screen_name  statuses_count  \
0  1502026416    TASUKU HAYAKAWA         0918Bask            2177   
1  2492782375              ro_or         1120Roll            2660   
2   293212315           bearclaw        14KBBrown            1254   
3   191839658  pocahontas farida      wadespeters          202968   
4  3020965143           Ms Kathy  191a5bd05da04dc              82   

   followers_count  friends_count  favourites_count  listed_count  \
0              208            332               265             1   
1              330            485              3972             5   
2              166            177              1185             0   
3             2248            981             60304           101   
4               21             79                 5             0   

                      url lang  ... test_set_2 is_protected  \
0                     NaN   ja  ...        0.0         True   
1                     NaN   ja  ...        0

# Filter dataset

In [22]:
def filter_columns(df: pd.DataFrame):
    df = df.copy()
    header = get_header(df)
    idx = header.index("is_protected")
    new_header = ["id"] + header[idx:]
    print_sep()
    print(f"header={header}")
    print(f"new_header={new_header}")
    return df[new_header]
    
show_head(df)
df = filter_columns(df)
print_sep()
show_head(df)

           id               name      screen_name  statuses_count  \
0  1502026416    TASUKU HAYAKAWA         0918Bask            2177   
1  2492782375              ro_or         1120Roll            2660   
2   293212315           bearclaw        14KBBrown            1254   
3   191839658  pocahontas farida      wadespeters          202968   
4  3020965143           Ms Kathy  191a5bd05da04dc              82   

   followers_count  friends_count  favourites_count  listed_count  \
0              208            332               265             1   
1              330            485              3972             5   
2              166            177              1185             0   
3             2248            981             60304           101   
4               21             79                 5             0   

                      url lang  ... test_set_2 is_protected  \
0                     NaN   ja  ...        0.0         True   
1                     NaN   ja  ...        0

In [30]:
def filter_removed_accounts(df: pd.DataFrame):
    df = df.copy()
    return df[~pd.isnull(df["time_of_existence"])]

In [32]:
df = filter_removed_accounts(df)
print(len(df))

8613


In [52]:
df = df.dropna()
print(len(df))

8089


# Build machine learning models

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn import metrics

In [61]:
classifiers = {
    "MPLClassifier": MLPClassifier,
    "KNeighborsClassifier": KNeighborsClassifier,
    "SVC": SVC,
    #"GaussianProcessClassifier": GaussianProcessClassifier,
    # "RBF": RBF,
    #"DecisionTreeClassifier": DecisionTreeClassifier,
    #"RandomForestClassifier": RandomForestClassifier,
    #"AdaBoostClassifier": AdaBoostClassifier,
    #"GaussianNB": GaussianNB,
    #"QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis
}

In [75]:
X = df
X_header = get_header(X)[1:-1]
account_id, X, y = df["id"], df[X_header], df["result"]
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

confusion_matrices = {}
precision = {}
recall = {}
f1_score = {}

for clf_name in classifiers:
    print(f"Training and testing classifier {clf_name}")
    clf = classifiers[clf_name]()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    confusion_matrices[clf_name] = metrics.confusion_matrix(y_pred, y_test)
    f1_score[clf_name] = metrics.f1_score(y_test, y_pred, average="weighted")
    precision[clf_name] = metrics.precision_score(y_test, y_pred, average="weighted")
    recall[clf_name] = metrics.recall_score(y_test, y_pred, average="weighted")

for clf_name in classifiers:
    print(f"Classifier: {clf_name}")
    print(f"Confusion matrix: {confusion_matrices[clf_name]}")
    print(f"Precision: {precision[clf_name]}")
    print(f"Recall: {recall[clf_name]}")
    print(f"F1 score: {f1_score[clf_name]}")
    print_sep()

Training and testing classifier MPLClassifier




Training and testing classifier KNeighborsClassifier
Training and testing classifier SVC
Classifier: MPLClassifier
Confusion matrix: [[556  28  54]
 [ 23 318   2]
 [ 41   3 998]]
Precision: 0.9260353158622245
Recall: 0.9253583786455759
F1 score: 0.9256131888259417
----------------------------------------
Classifier: KNeighborsClassifier
Confusion matrix: [[ 549   22   40]
 [  26  325    1]
 [  45    2 1013]]
Precision: 0.9325669275414742
Recall: 0.9327731092436975
F1 score: 0.9326484535175324
----------------------------------------
Classifier: SVC
Confusion matrix: [[545  33 109]
 [ 31 312   2]
 [ 44   4 943]]
Precision: 0.8949157416272661
Recall: 0.8897676717745922
F1 score: 0.8912066735666442
----------------------------------------
