In [154]:
%%writefile /content/drive/MyDrive/data_mining/notebooks/feature_selection.py
# importing modules
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

def import_files(data_path):
    """
    Load the id mapping, the training labels and the feature table found under
    ``data_path`` and combine them into a single dataframe.

    Files read:
      * ``users/coded_ids.csv``              (has ``coded_id`` and ``user_id``)
      * ``users/coded_ids_labels_train.csv`` (keyed by ``coded_id``)
      * ``users_features/features.csv``      (keyed by ``user_id``)

    The labels are left-joined onto the id mapping by ``coded_id`` and the
    result is left-joined onto the features by ``user_id``, so rows without a
    matching label keep a null in the label column(s).

    Returns the joined dataframe indexed by ``coded_id`` and sorted on that
    index; ``user_id`` is discarded.
    """
    users_dir = os.path.join(data_path, 'users')
    features_dir = os.path.join(data_path, 'users_features')

    id_map = pd.read_csv(os.path.join(users_dir, 'coded_ids.csv'))
    train_labels = pd.read_csv(os.path.join(users_dir, 'coded_ids_labels_train.csv'))
    feature_table = pd.read_csv(os.path.join(features_dir, 'features.csv'))

    # Attach labels by coded_id, then re-key by user_id so the result can be
    # joined onto the feature table.
    ids_with_labels = (
        id_map.set_index('coded_id')
        .join(train_labels.set_index('coded_id'))
        .reset_index()
        .set_index('user_id')
    )

    return (
        feature_table.set_index('user_id')
        .join(ids_with_labels)
        .reset_index(drop=True)
        .set_index('coded_id')
        .sort_index()
    )

def get_clean_data(data_path):
    """
    Load the joined dataset from ``data_path`` and make it model-ready.

    Steps applied:

    1. label-encode ``lang`` and ``time_zone`` (the latter is cast to ``str``
       first)
    2. turn ``date_newest_tweet`` / ``date_oldest_tweet`` into integers by
       parsing them as datetimes, casting to ``int64`` and floor-dividing by
       ``10 ** 9``
    3. replace missing ``utc_offset`` values with 0
    4. drop the inter-tweet-time statistics and the other columns that are
       not used downstream

    Returns the cleaned dataframe.
    """
    frame = import_files(data_path)

    # Categorical columns -> integer codes (each column gets its own fit).
    frame['lang'] = LabelEncoder().fit_transform(frame['lang'])
    frame['time_zone'] = LabelEncoder().fit_transform(frame['time_zone'].astype(str))

    # Datetime columns -> integer timestamps.
    for date_col in ('date_newest_tweet', 'date_oldest_tweet'):
        epoch_ints = pd.to_datetime(frame[date_col]).astype('int64')
        frame[date_col] = epoch_ints // 10 ** 9

    frame['utc_offset'] = frame['utc_offset'].fillna(0)

    unused_columns = [
        'avg_intertweet_times',
        'max_intertweet_times',
        'min_intertweet_times',
        'std_intertweet_times',
        'followers_count_minus_2002',
        'friends_count_minus_2002',
        'spam_in_screen_name',
    ]
    return frame.drop(columns=unused_columns)

def get_feature_groups(data):
    """
    Create different feature groups based on their relationship with the
    target variable ``label``.

    The returned list holds, in this order:

    1. all columns
    2. columns with absolute correlation with ``label`` > 0.2
    3. columns with absolute correlation with ``label`` > 0.3
    4. top 30 columns by chi square score
    5. top 50 columns by chi square score
    6. top 80 columns by chi square score

    Groups 4-6 are plain lists with ``'label'`` appended; groups 1-3 are the
    pandas / numpy containers produced by the selection itself.

    The chi square scores are computed on the labelled rows only, after
    min-max scaling the features (chi2 requires non-negative inputs).
    """
    # The correlation matrix is expensive; compute it once and reuse it.
    label_corr = data.corr()['label'].abs()

    groups = [data.columns]
    for threshold in (0.2, 0.3):
        groups.append(label_corr[label_corr > threshold].index.values)

    labelled = data[data['label'].notnull()]
    features = labelled.drop('label', axis=1)
    target = labelled['label']
    scaled = MinMaxScaler().fit_transform(features)

    # Map the selector's support mask through the columns of the matrix that
    # was actually fitted (i.e. without 'label'). Indexing data.columns
    # instead would shift every name after 'label' by one whenever 'label'
    # is not the last column.
    feature_names = features.columns

    for k in (30, 50, 80):
        selector = SelectKBest(chi2, k=k).fit(scaled, target)
        group = list(feature_names[selector.get_support()])
        group.append('label')
        groups.append(group)

    return groups

def get_train_and_val_set(data_path, return_full_set=False, val_size=86, random_state=101):
    """
    Split the labelled part of the dataset into a train set and a validation
    set for each feature group.

    Rows whose ``label`` is null (the unlabelled test rows) are not part of
    any returned object.

    Parameters:
        data_path: root folder of the dataset, forwarded to ``get_clean_data``.
        return_full_set: when True, additionally return the un-split labelled
            frames (one per feature group).
        val_size: ``test_size`` passed to ``train_test_split``; an int is an
            absolute number of validation rows. Defaults to the original 86.
        random_state: seed passed to ``train_test_split``. Defaults to the
            original 101.

    Returns:
        ``train_val_set``: a list with one ``(X_train, y_train, X_val, y_val)``
        tuple per feature group. If ``return_full_set`` is True, the tuple
        ``(train_val_set, train_set)`` is returned instead, where ``train_set``
        is the list of full labelled frames (features plus ``label``).
    """
    data = get_clean_data(data_path)
    groups = get_feature_groups(data)

    # The row filter is identical for every group, so apply it once.
    labelled = data[data['label'].notnull()]

    train_set = []
    train_val_set = []
    for columns in groups:
        full_train = labelled[columns].copy()
        train_set.append(full_train)

        train, val = train_test_split(
            full_train, test_size=val_size, random_state=random_state
        )

        X_train, y_train = train.drop('label', axis=1), train['label']
        X_val, y_val = val.drop('label', axis=1), val['label']
        train_val_set.append((X_train, y_train, X_val, y_val))

    if return_full_set:
        return train_val_set, train_set
    return train_val_set

if __name__ == '__main__':
    # Running the module directly builds every train/validation split once,
    # as a quick check that the whole pipeline works on the dataset.
    data_path = '/content/drive/MyDrive/data_mining/Social_spammers_dataset'
    t_v_set = get_train_and_val_set(data_path=data_path)

Overwriting /content/drive/MyDrive/data_mining/notebooks/feature_selection.py


In [155]:
!python /content/drive/MyDrive/data_mining/notebooks/feature_selection.py

In [176]:
%%writefile /content/drive/MyDrive/data_mining/notebooks/models_with_crossval.py
# importing modules
import pandas as pd
import numpy as np
from sklearn import set_config
from feature_selection import get_train_and_val_set
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, ShuffleSplit

# Print estimators with only their non-default parameters, which keeps the
# per-model header printed by get_crossval_scores short.
set_config(print_changed_only=True)

def get_crossval_scores(data_path, to_print=False):
    """
    Cross-validate a fixed list of classifiers on every feature group.

    Each model is scored with ``cross_val_score`` (accuracy) on the full
    labelled set of each feature group, using 5 shuffled splits with 25% of
    the rows held out per split.

    Parameters:
        data_path: root folder of the dataset, forwarded to
            ``get_train_and_val_set``.
        to_print: when True, print each model followed by the per-group
            scores and their mean. Nothing is printed when False.

    Returns:
        A dict keyed by the estimator instances; each value is a dict mapping
        ``'group <i>'`` (1-based) to the array of split scores.
    """
    models = [KNeighborsClassifier(n_neighbors=5),
              DecisionTreeClassifier(),
              RandomForestClassifier(n_estimators=101),
              RandomForestClassifier(n_estimators=200),
              GradientBoostingClassifier(n_estimators=101),
              GradientBoostingClassifier(n_estimators=200)]
    _, train_set = get_train_and_val_set(data_path, return_full_set=True)

    # The splitter is seeded with an int, so one instance yields the same
    # splits on every use; no need to rebuild it per model and group.
    cv = ShuffleSplit(n_splits=5, test_size=0.25, random_state=0)

    scores = {}
    for model in models:
        if to_print:
            print(model)

        model_scores = {}
        scores[model] = model_scores
        for i, train in enumerate(train_set, 1):
            X, y = train.drop('label', axis=1), train['label']
            score = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
            model_scores[f'group {i}'] = score
            if to_print:
                print('---- feature group', i, ':', score, 'Mean:', np.mean(score))

        # The separator line belongs to the report, so it is only emitted
        # when printing was requested.
        if to_print:
            print()

    return scores

if __name__ == '__main__':
    # Run the full cross-validation report and persist the raw scores.
    data_path = '/content/drive/MyDrive/data_mining/Social_spammers_dataset'
    output_path = '/content/drive/MyDrive/data_mining/notebooks/crossval_scores.pkl'

    scores = get_crossval_scores(data_path, to_print=True)
    scores_frame = pd.DataFrame(scores)
    scores_frame.to_pickle(output_path)

Overwriting /content/drive/MyDrive/data_mining/notebooks/models_with_crossval.py


In [177]:
!python /content/drive/MyDrive/data_mining/notebooks/models_with_crossval.py

KNeighborsClassifier()
---- feature group 1 : [0.91860465 0.86046512 0.90116279 0.86627907 0.85465116] Mean: 0.8802325581395349
---- feature group 2 : [0.91860465 0.86046512 0.90116279 0.86627907 0.85465116] Mean: 0.8802325581395349
---- feature group 3 : [0.87790698 0.84302326 0.90116279 0.83139535 0.84883721] Mean: 0.8604651162790697
---- feature group 4 : [0.8372093  0.81395349 0.83139535 0.86627907 0.84883721] Mean: 0.8395348837209301
---- feature group 5 : [0.8255814  0.80813953 0.84302326 0.86627907 0.86046512] Mean: 0.8406976744186047
---- feature group 6 : [0.90116279 0.87790698 0.87790698 0.86627907 0.84883721] Mean: 0.8744186046511627

DecisionTreeClassifier()
---- feature group 1 : [0.95348837 0.93604651 0.94186047 0.97674419 0.95348837] Mean: 0.9523255813953488
---- feature group 2 : [0.96511628 0.93023256 0.98255814 0.97093023 0.94767442] Mean: 0.9593023255813954
---- feature group 3 : [0.97674419 0.90697674 0.96511628 0.94186047 0.93604651] Mean: 0.9453488372093023
---- f

In [47]:
%%writefile /content/drive/MyDrive/data_mining/notebooks/ml_models.py
# importing modules
from feature_selection import get_train_and_val_set
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

def train(model, X_train, y_train):
    """
    Fit ``model`` on the training data, then score it on that same data.

    Returns whatever ``model.score(X_train, y_train)`` returns after fitting.
    """
    model.fit(X_train, y_train)
    return model.score(X_train, y_train)

def evaluate(model, X_val, y_val):
    """
    Score an already fitted ``model`` on the validation data.

    Returns whatever ``model.score(X_val, y_val)`` returns.
    """
    return model.score(X_val, y_val)

def predict(model, X_test):
    """
    Run ``model`` on the test features and return its predictions.

    The result is passed through unchanged from ``model.predict(X_test)``.
    """
    return model.predict(X_test)

# Fit each model on every feature group's train split and record the
# train / validation scores, printing them as they are produced.
data_path = '/content/drive/MyDrive/data_mining/Social_spammers_dataset'
models = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
]
train_val_set = get_train_and_val_set(data_path)

# One entry per model class name, each holding the per-group score lists.
scores = {
    type(model).__name__: {'train_scores': [], 'val_scores': []}
    for model in models
}

for model in models:
    model_scores = scores[type(model).__name__]
    print(type(model).__name__)

    for i, (X_train, y_train, X_val, y_val) in enumerate(train_val_set, 1):
        # Score on the data the model was just fitted on ...
        score = train(model, X_train, y_train)
        model_scores['train_scores'].append(score)
        print('---- train score', i, ':', score)

        # ... and on the held-out validation rows.
        score = evaluate(model, X_val, y_val)
        model_scores['val_scores'].append(score)
        print('---- val score', i, ':', score)
        print()


Overwriting /content/drive/MyDrive/data_mining/notebooks/ml_models.py


In [48]:
!python /content/drive/MyDrive/data_mining/notebooks/ml_models.py

DecisionTreeClassifier
---- train score 1 : 1.0
---- val score 1 : 0.9302325581395349

---- train score 2 : 1.0
---- val score 2 : 0.9534883720930233

---- train score 3 : 1.0
---- val score 3 : 0.9534883720930233

---- train score 4 : 1.0
---- val score 4 : 0.9651162790697675

RandomForestClassifier
---- train score 1 : 1.0
---- val score 1 : 0.9767441860465116

---- train score 2 : 1.0
---- val score 2 : 0.9767441860465116

---- train score 3 : 1.0
---- val score 3 : 0.9767441860465116

---- train score 4 : 1.0
---- val score 4 : 0.9651162790697675

GradientBoostingClassifier
---- train score 1 : 1.0
---- val score 1 : 0.9883720930232558

---- train score 2 : 1.0
---- val score 2 : 0.9767441860465116

---- train score 3 : 1.0
---- val score 3 : 0.9767441860465116

---- train score 4 : 1.0
---- val score 4 : 0.9651162790697675



In [163]:
from feature_selection import get_train_and_val_set

ModuleNotFoundError: ignored

In [162]:
!cd /content/drive/MyDrive/data_mining/notebooks/