# Fraud in Electricity and Gas Consumption #

## Data Cleaning

### Since 2 datasets were provided, we attempt to combine both datasets into 1 on the id columm. ###

In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import spearmanr
from statsmodels.stats.outliers_influence import variance_inflation_factor
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import fbeta_score, make_scorer

from sklearn import svm

import lightgbm as lgb

from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

seed = 69


In [25]:
invoice_df = pd.read_csv('invoice.csv')
client_df = pd.read_csv('client.csv')

combined_df = pd.merge(client_df, invoice_df, on='id', how='left')



In [26]:
combined_df.head()

Unnamed: 0,region,date_x,dis,id,catg,target,date_y,tarif_type,counter_number,counter_statue,...,reading_remarque,consommation_level_4,old_index,new_index,months_number,counter_type,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3
0,101,31/12/1994,60,train_Client_0,11,0,24/3/2014,11,1335667.0,0,...,8,0,14302,14384,4,ELEC,1,82,0,0
1,101,31/12/1994,60,train_Client_0,11,0,29/3/2013,11,1335667.0,0,...,6,0,12294,13678,4,ELEC,1,1200,184,0
2,101,31/12/1994,60,train_Client_0,11,0,23/3/2015,11,1335667.0,0,...,8,0,14624,14747,4,ELEC,1,123,0,0
3,101,31/12/1994,60,train_Client_0,11,0,13/7/2015,11,1335667.0,0,...,8,0,14747,14849,4,ELEC,1,102,0,0
4,101,31/12/1994,60,train_Client_0,11,0,17/11/2016,11,1335667.0,0,...,9,0,15066,15638,12,ELEC,1,572,0,0


In [27]:
combined_df.describe()


Unnamed: 0,region,dis,catg,target,tarif_type,counter_number,counter_statue,counter_code,reading_remarque,consommation_level_4,old_index,new_index,months_number,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3
count,500651.0,500651.0,500651.0,500651.0,500651.0,500651.0,500651.0,500651.0,500651.0,500651.0,500651.0,500651.0,500651.0,500651.0,500651.0,500651.0,500651.0
mean,204.746922,63.519156,11.353871,0.062644,16.108279,195103400000.0,0.050217,204.390755,7.46371,64.39315,15759.69,16390.37,22.744289,1.000154,443.065463,120.508706,28.196772
std,104.620488,3.38872,3.66142,0.242323,11.145881,2071552000000.0,0.396153,121.204514,1.374409,1230.465569,29757.33,30537.07,1670.624818,0.04715,592.249623,1396.817086,214.020756
min,101.0,60.0,11.0,0.0,9.0,0.0,0.0,5.0,6.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,101.0,62.0,11.0,0.0,11.0,147722.0,0.0,202.0,6.0,0.0,1799.0,2165.0,4.0,1.0,99.0,0.0,0.0
50%,107.0,62.0,11.0,0.0,11.0,485701.0,0.0,203.0,8.0,0.0,7876.0,8438.0,4.0,1.0,321.0,0.0,0.0
75%,307.0,69.0,11.0,0.0,11.0,1008740.0,0.0,207.0,9.0,0.0,20927.5,21645.0,4.0,1.0,661.0,0.0,0.0
max,399.0,69.0,51.0,1.0,45.0,27400000000000.0,5.0,600.0,9.0,343568.0,2800280.0,2870972.0,231602.0,20.0,98889.0,819886.0,45360.0


In [28]:
combined_df.head()


Unnamed: 0,region,date_x,dis,id,catg,target,date_y,tarif_type,counter_number,counter_statue,...,reading_remarque,consommation_level_4,old_index,new_index,months_number,counter_type,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3
0,101,31/12/1994,60,train_Client_0,11,0,24/3/2014,11,1335667.0,0,...,8,0,14302,14384,4,ELEC,1,82,0,0
1,101,31/12/1994,60,train_Client_0,11,0,29/3/2013,11,1335667.0,0,...,6,0,12294,13678,4,ELEC,1,1200,184,0
2,101,31/12/1994,60,train_Client_0,11,0,23/3/2015,11,1335667.0,0,...,8,0,14624,14747,4,ELEC,1,123,0,0
3,101,31/12/1994,60,train_Client_0,11,0,13/7/2015,11,1335667.0,0,...,8,0,14747,14849,4,ELEC,1,102,0,0
4,101,31/12/1994,60,train_Client_0,11,0,17/11/2016,11,1335667.0,0,...,9,0,15066,15638,12,ELEC,1,572,0,0


In [29]:
print(f"Length of dataset: {len(combined_df)}")
print(f"Number of datapoints in each column: \n{combined_df.count()} \n")

Length of dataset: 500651
Number of datapoints in each column: 
region                  500651
date_x                  500651
dis                     500651
id                      500651
catg                    500651
target                  500651
date_y                  500651
tarif_type              500651
counter_number          500651
counter_statue          500651
counter_code            500651
reading_remarque        500651
consommation_level_4    500651
old_index               500651
new_index               500651
months_number           500651
counter_type            500651
counter_coefficient     500651
consommation_level_1    500651
consommation_level_2    500651
consommation_level_3    500651
dtype: int64 



In [30]:
number_of_fraud = sum(combined_df["target"] == 1)
print(number_of_fraud)
print(f"proportion of fraud: {number_of_fraud/len(combined_df)}")

31363
proportion of fraud: 0.06264443694310008


#### We have created 4 new variables, described as such:
##### delta_start_invoice: diff between join and transaction date
##### delta_index: diff between old and new index
##### delta_transactions: diff between transactions over the same client
##### consommation_sum: sum of consommation levels

In [31]:

dates = {'join_date': combined_df['date_x'], 'transaction_date': combined_df['date_y'], 'id': combined_df['id']}
dates_df = pd.DataFrame(dates)

dates_df['join_date'] = pd.to_datetime(dates_df['join_date'] , format='%d/%m/%Y')
dates_df['transaction_date'] = pd.to_datetime(dates_df['transaction_date'], format='%d/%m/%Y')

# Calculate the difference in days between transaction and join date
dates_df['delta_start_invoice'] = (dates_df['transaction_date']- dates_df['join_date']).dt.days

# Create new delta_transactions (diff between transaction dates for each client)
dates_df = dates_df.sort_values(['id', 'delta_start_invoice'])
dates_df['delta_transactions'] = dates_df.groupby('id')['delta_start_invoice'].diff().fillna(0)

dates_df.head()



Unnamed: 0,join_date,transaction_date,id,delta_start_invoice,delta_transactions
22,1994-12-31,2005-10-17,train_Client_0,3943,0.0
23,1994-12-31,2006-02-24,train_Client_0,4073,130.0
24,1994-12-31,2006-06-23,train_Client_0,4192,119.0
25,1994-12-31,2006-10-18,train_Client_0,4309,117.0
28,1994-12-31,2007-02-26,train_Client_0,4440,131.0


#### add new delta_start_invoice, delta_index and consommation_sum to combined_df

In [32]:
combined_df['delta_index'] = combined_df['new_index'] - combined_df['old_index']
combined_df['delta_start_invoice'] = dates_df['delta_start_invoice']
combined_df['delta_transactions'] = dates_df['delta_transactions']

#### new dataframe for one-hot encoding categorical variables (dis, catg, region, tarif_type, counter_statue, counter_code, reading_remarque, counter_type)

In [33]:
categorical_vars = ['dis', 'catg', 'region', 'tarif_type', 'counter_statue', 'counter_code', 'counter_type']
categorical_df = pd.get_dummies(combined_df, columns=categorical_vars, prefix=categorical_vars)
categorical_df = categorical_df.groupby('id').agg({col: 'max' for col in categorical_df.columns if col != 'id'})



#### Agg function to group the transactions with each client

In [34]:
stats = ['sum', 'mean', 'median', 'std']

selected_columns = ['consommation_level_1', 
                    'consommation_level_2', 'consommation_level_3', 'consommation_level_4',
                    'delta_index', 'delta_start_invoice', 'id', 'reading_remarque']

# Create a new dataframe with the desired aggregate functions
numerical_df = combined_df[selected_columns].groupby('id').agg(stats)

numerical_df.head()




Unnamed: 0_level_0,consommation_level_1,consommation_level_1,consommation_level_1,consommation_level_1,consommation_level_2,consommation_level_2,consommation_level_2,consommation_level_2,consommation_level_3,consommation_level_3,...,delta_index,delta_index,delta_start_invoice,delta_start_invoice,delta_start_invoice,delta_start_invoice,reading_remarque,reading_remarque,reading_remarque,reading_remarque
Unnamed: 0_level_1,sum,mean,median,std,sum,mean,median,std,sum,mean,...,median,std,sum,mean,median,std,sum,mean,median,std
id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
train_Client_0,12334,352.4,267.0,310.343472,370,10.571429,0.0,43.568935,0,0.0,...,267.0,341.55393,213142,6089.771429,6047.0,1358.574709,244,6.971429,6.0,1.248192
train_Client_1,20629,557.540541,520.0,197.93596,0,0.0,0.0,0.0,0,0.0,...,520.0,197.93596,132603,3583.864865,3509.0,1457.748762,267,7.216216,6.0,1.377097
train_Client_10,14375,798.611111,655.5,513.841374,682,37.888889,0.0,160.748942,0,0.0,...,655.5,646.808386,165982,9221.222222,8678.0,1526.789733,127,7.055556,6.0,1.258955
train_Client_100,24,1.2,0.0,3.607011,0,0.0,0.0,0.0,0,0.0,...,0.0,3.607011,91275,4563.75,4545.5,774.520692,123,6.15,6.0,0.67082
train_Client_1000,9292,663.714286,770.0,224.831365,1468,104.857143,0.0,167.15532,1643,117.357143,...,770.0,633.485669,13497,964.071429,1010.0,506.611437,124,8.857143,9.0,0.363137


#### Combining numerical and cat dataframes

In [35]:
to_drop = ['region', 'date_x', 'dis', 'id', 'catg', 'target', 'date_y', 'tarif_type', 'counter_number', 
           'counter_statue', 'counter_code', 'reading_remarque', 'consommation_level_4', 'old_index',
           'new_index', 'months_number', 'counter_type', 'counter_coefficient', 'consommation_level_1',
           'consommation_level_2', 'consommation_level_3']

client_summary = pd.concat([numerical_df, categorical_df], axis=1)

# Identify existing columns in the DataFrame
existing_columns = [col for col in to_drop if col in client_summary.columns]

# Drop existing columns from the DataFrame
client_summary = client_summary.drop(columns=existing_columns)

client_summary.info()




<class 'pandas.core.frame.DataFrame'>
Index: 21652 entries, train_Client_0 to train_Client_128438
Columns: 116 entries, ('consommation_level_1', 'sum') to counter_type_GAZ
dtypes: bool(85), float64(22), int64(9)
memory usage: 7.0+ MB


#### Add y variable

In [36]:
client_summary['target'] = combined_df.groupby('id')['target'].apply(lambda x: 1 if x.any() else 0)
client_summary['target'].value_counts()


target
0    20576
1     1076
Name: count, dtype: int64

In [37]:
combined_df.groupby('id')['target'].apply(lambda x: 1 if x.any() else 0).value_counts()

target
0    20576
1     1076
Name: count, dtype: int64

## Data Balancing

#### Due to the low proportion of fraud cases, we performed synthetic oversampling of fraud cases with SMOTE and undersampled non-fraud cases with Tomek's link with three different methods:
#### 1) SMOTE + Tomek's Link

In [44]:
# Define the oversampling strategy using SMOTE and Tomek
smote = SMOTE(sampling_strategy='auto')

smote = SMOTE(random_state=seed)
tomek = TomekLinks()

X = client_summary.drop('target', axis=1)

# Flatten multi-level column names
X.columns = [''.join(map(str, col)).strip() for col in X.columns.to_flat_index()]

y = client_summary['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= seed)

print(X_train[X_train.isnull().any(axis=1)])


# under and oversampling of data using Tomek and SMOTE
X_train1, y_train1 = tomek.fit_resample(X_train, y_train)
X_train1_resampled, y_train1_resampled = smote.fit_resample(X_train1, y_train1)


# Standardize the data separately to prevent leakage
scaler = StandardScaler()
X_train1_standardized = pd.DataFrame(scaler.fit_transform(X_train1_resampled), columns=X_train1_resampled.columns)
X_test_standardized = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Norminalise the data
scaler_minmax = MinMaxScaler()
X_train1_norminalized = pd.DataFrame(scaler_minmax.fit_transform(X_train1_resampled), columns=X_train1_resampled.columns)
# X_test_norminalized = pd.DataFrame(scaler_minmax.transform(X_test), columns=X_test.columns)



Empty DataFrame
Columns: [consommation_level_1sum, consommation_level_1mean, consommation_level_1median, consommation_level_1std, consommation_level_2sum, consommation_level_2mean, consommation_level_2median, consommation_level_2std, consommation_level_3sum, consommation_level_3mean, consommation_level_3median, consommation_level_3std, consommation_level_4sum, consommation_level_4mean, consommation_level_4median, consommation_level_4std, delta_indexsum, delta_indexmean, delta_indexmedian, delta_indexstd, delta_start_invoicesum, delta_start_invoicemean, delta_start_invoicemedian, delta_start_invoicestd, reading_remarquesum, reading_remarquemean, reading_remarquemedian, reading_remarquestd, delta_index, delta_start_invoice, delta_transactions, dis_60, dis_62, dis_63, dis_69, catg_11, catg_12, catg_51, region_101, region_103, region_104, region_105, region_106, region_107, region_206, region_301, region_302, region_303, region_304, region_305, region_306, region_307, region_308, region_30

#### 2) SMOTE only

In [45]:
# Define the oversampling strategy using SMOTE only
smote = SMOTE(sampling_strategy='auto')
smote = SMOTE(random_state=seed)

X = client_summary.drop('target', axis=1)

# Flatten multi-level column names
X.columns = [''.join(map(str, col)).strip() for col in X.columns.to_flat_index()]

y = client_summary['target']

X_train2, X_test, y_train2, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

# Oversampling of data using SMOTE
X_train2, y_train2 = smote.fit_resample(X_train2, y_train2)

# Standardize the data separately to prevent leakage
scaler = StandardScaler()
X_train2_standardized = pd.DataFrame(scaler.fit_transform(X_train2), columns=X_train2.columns)
X_test_standardized = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

#### 3) Tomek's Link only

In [46]:
# Define the undersampling strategy using Tomek's Link only
tomek = TomekLinks()

X = client_summary.drop('target', axis=1)

# Flatten multi-level column names
X.columns = [''.join(map(str, col)).strip() for col in X.columns.to_flat_index()]

y = client_summary['target']

X_train3, X_test, y_train3, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

# Undersampling of data using Tomek Links
X_train3, y_train3 = tomek.fit_resample(X_train3, y_train3)

# Standardize the data separately to prevent leakage
scaler = StandardScaler()
X_train3_standardized = pd.DataFrame(scaler.fit_transform(X_train3), columns=X_train3.columns)
X_test_standardized = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [47]:
y_train.value_counts()

target
0    16461
1      860
Name: count, dtype: int64

In [48]:
y_test.value_counts()


target
0    4115
1     216
Name: count, dtype: int64

In [49]:
# SMOTE + Tomak's Link
y_train1_resampled.value_counts()

target
0    16111
1    16111
Name: count, dtype: int64

In [50]:
# SMOTE
y_train2.value_counts()

target
0    16461
1    16461
Name: count, dtype: int64

In [51]:
# Tomak's link
y_train3.value_counts()


target
0    16111
1      860
Name: count, dtype: int64

In [52]:
# Change Bool to int
X_train1_resampled = X_train1_resampled.astype({col: int for col in X_train1_resampled.select_dtypes(include='bool').columns})
X_train2 = X_train2.astype({col: int for col in X_train2.select_dtypes(include='bool').columns})
X_train3 = X_train3.astype({col: int for col in X_train3.select_dtypes(include='bool').columns})

In [53]:
# top 20 vars with highest correlation
corr_matrix_std = X_train1_standardized.corrwith(y_train1_resampled, method='spearman')

top_20_vars = corr_matrix_std.abs().sort_values(ascending=False).head(20).index.tolist()

print(top_20_vars)

['consommation_level_3sum', 'consommation_level_3mean', 'consommation_level_3std', 'consommation_level_2sum', 'consommation_level_2std', 'consommation_level_2mean', 'consommation_level_4sum', 'consommation_level_4std', 'consommation_level_4mean', 'delta_index', 'delta_indexsum', 'delta_indexstd', 'dis_69', 'consommation_level_1sum', 'delta_start_invoicestd', 'reading_remarquesum', 'consommation_level_1std', 'counter_statue_5', 'counter_code_203', 'counter_code_413']


  return spearmanr(a, b)[0]


In [55]:
# Picking 20 vars to keep (with lowest collinearity)

# Create a subset of the dataset with only the top 20 variables
X_top20 = X_train1_standardized[top_20_vars]

# Remove constant columns from X_top20
X_top20 = X_top20.loc[:, (X_top20 != X_top20.iloc[0]).any()]

# Calculate correlation matrix between top 20 variables
new_corr_matrix_std = X_top20.corr(method='spearman')

# Convert the correlation matrix to a DataFrame
corr_df = new_corr_matrix_std.reset_index().melt('index', var_name='Variable2', value_name='Correlation')
corr_df.columns = ['Variable1', 'Variable2', 'Correlation']

# Remove self-correlations (diagonal elements)
corr_df = corr_df[corr_df['Variable1'] != corr_df['Variable2']]

corr_df['Correlation'] = corr_df['Correlation'].abs()

# Rank the pairs based on correlation (descending)
corr_df = corr_df.sort_values(by='Correlation', ascending=True).reset_index(drop=True)

# Add a ranking column
corr_df.insert(0, 'Rank', range(1, len(corr_df)+1))

# Print the table
print(corr_df.head(40))

    Rank                 Variable1                 Variable2  Correlation
0      1          counter_statue_5                    dis_69     0.001869
1      2                    dis_69          counter_statue_5     0.001869
2      3          counter_code_413       reading_remarquesum     0.002269
3      4       reading_remarquesum          counter_code_413     0.002269
4      5          counter_statue_5          counter_code_413     0.007266
5      6          counter_code_413          counter_statue_5     0.007266
6      7    delta_start_invoicestd  consommation_level_4mean     0.014489
7      8  consommation_level_4mean    delta_start_invoicestd     0.014489
8      9            delta_indexsum          counter_code_203     0.014550
9     10          counter_code_203            delta_indexsum     0.014550
10    11                    dis_69       reading_remarquesum     0.017995
11    12       reading_remarquesum                    dis_69     0.017995
12    13   consommation_level_4std    

In [56]:
# From the correlation matrix, we pick the top 10 vars with lowest collinearity for the baseline logistic regression model
# using method 1 (SMOTE + Tomak's Link)

top_10_vars = ['counter_statue_5', 'dis_69', 'counter_code_413', 'reading_remarquesum', 'delta_start_invoicestd',
'consommation_level_4mean', 'delta_indexsum', 'counter_code_203', 'consommation_level_4std', 'consommation_level_4sum']

X_train1_standardized_resampled_important = X_train1_standardized[top_10_vars]
X_train1_norminalized_resampled_important = X_train1_norminalized[top_10_vars]
X_train2_standardized_important = X_train2_standardized[top_10_vars]
X_train3_standardized_important = X_train3_standardized[top_10_vars]

### Run Logistic Regression

In [57]:
print(f"Training std dataset shape (method 1): {X_train1_standardized_resampled_important.shape}")
print(f"Training nom dataset shape (method 1): {X_train1_norminalized_resampled_important.shape}")
print(f"Training std dataset shape (method 2): {X_train2_standardized_important.shape}")
print(f"Training std dataset shape (method 3): {X_train3_standardized_important.shape}")


Training std dataset shape (method 1): (32222, 10)
Training nom dataset shape (method 1): (32222, 10)
Training std dataset shape (method 2): (32922, 10)
Training std dataset shape (method 3): (16971, 10)


In [58]:
# Comparing performance of standardized and nominalized datasets for method 1 (SMOTE + Tomak's Link)

# Initialize logistic regression model
model_LR = LogisticRegression()

# Perform cross-validation (std)
cross_val_results_std = cross_val_score(model_LR, X_train1_standardized_resampled_important, y_train1_resampled, cv=5, scoring='f1')

# Print cross-validation F1 score (std)
print("Cross-Validation F1 Scores (method 1) (std):", cross_val_results_std)
print("Mean F1 Score (method 1) (std):", cross_val_results_std.mean())

# Perform cross-validation (nom)
cross_val_results_nom = cross_val_score(model_LR, X_train1_norminalized_resampled_important, y_train1_resampled, cv=5, scoring='f1')

# Print cross-validation F1 score (nom)
print("Cross-Validation F1 Scores (method 1) (nom):", cross_val_results_nom)
print("Mean F1 Score (method 1) (nom):", cross_val_results_nom.mean())

# We pick standardising over norminalizing given its slightly better mean F1 score


Cross-Validation F1 Scores (method 1) (std): [0.69987021 0.75429975 0.75460123 0.7496136  0.74361342]
Mean F1 Score (method 1) (std): 0.74039964323809
Cross-Validation F1 Scores (method 1) (nom): [0.6913178  0.74888787 0.74911552 0.7442866  0.73904997]
Mean F1 Score (method 1) (nom): 0.7345315513896734


In [59]:
# Now, running logistic regression on standardised datasets for method 2 (SMOTE only) and method 3 (Tomak's Link only)

# Perform cross-validation (method 2)
cross_val_results_m2 = cross_val_score(model_LR, X_train2_standardized_important, y_train2, cv=5, scoring='f1')

# Print cross-validation F1 score (method 2)
print("Cross-Validation F1 Scores (method 2) (std):", cross_val_results_m2)
print("Mean F1 Score (method 2) (std):", cross_val_results_m2.mean())

# Perform cross-validation (method 3)
cross_val_results_m3 = cross_val_score(model_LR, X_train3_standardized_important, y_train3, cv=5, scoring='f1')

# Print cross-validation F1 score (method 2)
print("Cross-Validation F1 Scores (method 3) (std):", cross_val_results_m3)
print("Mean F1 Score (method 3) (std):", cross_val_results_m3.mean())

Cross-Validation F1 Scores (method 2) (std): [0.69572158 0.75192047 0.75045154 0.7496977  0.74861007]
Mean F1 Score (method 2) (std): 0.7392802717960579
Cross-Validation F1 Scores (method 3) (std): [0. 0. 0. 0. 0.]
Mean F1 Score (method 3) (std): 0.0


### Using SVM 

In [60]:
# Create an SVM classifier
SVM = svm.SVC(kernel='linear')

def f2_score(y_true, y_pred):
    return fbeta_score(y_true, y_pred, beta=2)

# Make scorer
f2_scorer = make_scorer(f2_score)

cross_val_results_m2_svm = cross_val_score(SVM, X_train2_standardized_important, y_train2, cv=5, scoring=f2_scorer)
print("Cross-Validation F2 Scores:", cross_val_results_m2_svm)
print("Mean F2 Score:", cross_val_results_m2_svm.mean())

Cross-Validation F2 Scores: [0.60003846 0.68719042 0.68580742 0.68055905 0.68334064]
Mean F2 Score: 0.6673871992387312


### Using GBM

In [63]:
# Create a LightGBM classifier
GBM = lgb.LGBMClassifier()

cross_val_results_m2_gbm = cross_val_score(GBM, X_train2_standardized_important, y_train2, cv=5, scoring=f2_scorer)
print("Cross-Validation F2 Scores:", cross_val_results_m2_gbm)
print("Mean F2 Score:", cross_val_results_m2_gbm.mean())

[LightGBM] [Info] Number of positive: 13169, number of negative: 13168
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001885 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1542
[LightGBM] [Info] Number of data points in the train set: 26337, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500019 -> initscore=0.000076
[LightGBM] [Info] Start training from score 0.000076
[LightGBM] [Info] Number of positive: 13168, number of negative: 13169
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000816 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1542
[LightGBM] [Info] Number of data points in the train set: 26337, number of used features: 10
[LightGBM] [Info] [b

### Using Random Forest

In [65]:
from sklearn.ensemble import RandomForestClassifier

# Create a Random Forest classifier
RF = RandomForestClassifier(n_estimators=100)

cross_val_results_m2_rf = cross_val_score(RF, X_train2_standardized_important, y_train2, cv=5, scoring=f2_scorer)
print("Cross-Validation F2 Scores:", cross_val_results_m2_rf)
print("Mean F2 Score:", cross_val_results_m2_rf.mean())

Cross-Validation F2 Scores: [0.85019751 0.904839   0.90422975 0.90533253 0.90342306]
Mean F2 Score: 0.8936043702526089


### Using ADA boost

In [67]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Create an AdaBoost classifier with Decision Tree as the base estimator
ADA = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=200)

cross_val_results_m2_ada = cross_val_score(ADA, X_train2_standardized_important, y_train2, cv=5, scoring=f2_scorer)
print("Cross-Validation F2 Scores:", cross_val_results_m2_ada)
print("Mean F2 Score:", cross_val_results_m2_ada.mean())

TypeError: AdaBoostClassifier.__init__() got an unexpected keyword argument 'base_estimator'



# Assuming you have the true labels (y_true) and predicted labels (y_pred)
f2_score = fbeta_score(y_true, y_pred, beta=2)