In [1]:
import pandas as pd

def find_high_outlier_threshold_iqr(data, column):
    """
    Compute the high-end outlier threshold of a column with the interquartile range (IQR) method.

    Args:
        data: Object indexable by ``column`` whose result provides ``quantile``
            (the notebook passes a pandas DataFrame).
        column: Key of the column to analyse.

    Returns:
        The upper fence ``Q3 + 1.5 * IQR``, where ``IQR = Q3 - Q1``.
    """
    values = data[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    return third_quartile + 1.5 * spread


def find_outlier_thresholds_iqr(data, column):
    """
    Compute both outlier thresholds of a column with the interquartile range (IQR) method.

    Args:
        data: Object indexable by ``column`` whose result provides ``quantile``
            (the notebook passes a pandas DataFrame).
        column: Key of the column to analyse.

    Returns:
        A ``(lower, upper)`` tuple holding ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``,
        where ``IQR = Q3 - Q1``.
    """
    values = data[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread
    return low_fence, high_fence


def find_outlier_threshold_percentile(data, column, percentile):
    """
    Compute an outlier threshold for a column with the percentile method.

    Args:
        data: Object indexable by ``column`` whose result provides ``quantile``
            (the notebook passes a pandas DataFrame).
        column: Key of the column to analyse.
        percentile: Percentile on a 0-100 scale; it is divided by 100 before
            being handed to ``quantile``.

    Returns:
        The value of the column at the requested percentile. Whether it acts as
        a low or a high cutoff is decided by the caller.
    """
    fraction = percentile / 100.0
    return data[column].quantile(fraction)

In [None]:
# Load the per-user feature table and report how many rows it holds.
data = pd.read_csv('user_twitter_data_complet.csv')
print(len(data))

# High-end thresholds from the IQR method (Q3 + 1.5 * IQR).
threshold_aggressiveness = find_high_outlier_threshold_iqr(data, 'aggressiveness_score')
threshold_url = find_high_outlier_threshold_iqr(data, 'avg_url')
threshold_hashtag = find_high_outlier_threshold_iqr(data, 'avg_hashtag')
threshold_visibility = find_high_outlier_threshold_iqr(data, 'visibility')
threshold_group_popularity = find_high_outlier_threshold_iqr(data, 'group_popularity')
threshold_mentions_freq = find_high_outlier_threshold_iqr(data, 'mentions_freq')

# Two-sided IQR thresholds for the average tweet length.
(
    threshold_tweet_length_low,
    threshold_tweet_length_high,
) = find_outlier_thresholds_iqr(data, 'avg_tweet_length')

# Percentile-based thresholds: 5th percentile for the low-end cutoffs ...
threshold_ratio_followers_friend = find_outlier_threshold_percentile(data, 'ratio_followers_friend', 5)
threshold_account_age = find_outlier_threshold_percentile(data, 'account_age', 5)
threshold_favorite_rate = find_outlier_threshold_percentile(data, 'avg_fav', 5)
# ... and 95th percentile for the tweet-frequency cutoff.
threshold_tweet_frequency = find_outlier_threshold_percentile(data, 'tweet_frequency', 95)

In [6]:
def label_users(min_suspect_signals=5):
    """
    Label every row of the module-level ``data`` frame as suspect (1) or not (0).

    Each row is checked against eleven threshold rules that use the
    module-level ``threshold_*`` values. The number of rules a row triggers is
    its suspect index. A row is labeled 1 when its suspect index reaches
    ``min_suspect_signals``, unless its ``verified`` value equals ``True``, in
    which case it is always labeled 0.

    Args:
        min_suspect_signals: Minimum number of triggered rules required to
            label a row 1. Defaults to 5.

    Returns:
        A list of ints (0 or 1), one per row of ``data``, in row order.
    """
    # One boolean Series per rule. Comparisons involving NaN evaluate to False,
    # so a missing value never triggers a rule.
    signals = [
        data['aggressiveness_score'] > threshold_aggressiveness,
        data['avg_url'] > threshold_url,
        data['avg_hashtag'] > threshold_hashtag,
        data['ratio_followers_friend'] < threshold_ratio_followers_friend,
        (data['avg_tweet_length'] < threshold_tweet_length_low)
        | (data['avg_tweet_length'] > threshold_tweet_length_high),
        data['tweet_frequency'] > threshold_tweet_frequency,
        data['visibility'] > threshold_visibility,
        # Account age is compared with its own threshold; it was previously
        # compared with threshold_favorite_rate by mistake.
        data['account_age'] < threshold_account_age,
        data['avg_fav'] < threshold_favorite_rate,
        data['group_popularity'] > threshold_group_popularity,
        data['mentions_freq'] > threshold_mentions_freq,
    ]
    suspect_index = sum(signal.astype(int) for signal in signals)

    # Verified rows are never labeled suspect, whatever their suspect index.
    is_verified = data['verified'].eq(True)
    is_suspect = (suspect_index >= min_suspect_signals) & ~is_verified
    return is_suspect.astype(int).tolist()


In [None]:
# Compute one 0/1 label per row of `data` (see label_users).
labels = label_users()

# Store the labels in a new "label" column of `data` (in-place mutation), then
# write the labeled frame to CSV; index=False leaves the row index out of the file.
data["label"] = labels
data.to_csv("data_labeled_complet.csv", index=False)