In [4]:
import pandas as pd
import glob
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [5]:
# Paths to training and testing data folders
train_folder = "C:\\Users\\grays\\Downloads\\train\\train\\"
test_folder = "C:\\Users\\grays\\Downloads\\test\\test\\"

# List all CSV files in the train folder
train_files = glob.glob(train_folder + "*.csv")
test_files = glob.glob(test_folder + "*.csv")

# Function to load and concatenate CSV files from a list of file paths
def load_and_concat(files):
    df_list = []
    for file in files:
        df = pd.read_csv(file)
        df_list.append(df)
    return pd.concat(df_list, ignore_index=True)

# Load and concatenate training and testing data
train_df = load_and_concat(train_files)
test_df = load_and_concat(test_files)

In [6]:
def label_data(df):
    df['label'] = (df['ack_flag_number'] < 0.4).astype(int)
    return df

# Label the training and testing data
train_df = label_data(train_df)
test_df = label_data(test_df)

# Separate features and labels
X_train = train_df.drop(columns=['label'])
y_train = train_df['label']
X_test = test_df.drop(columns=['label'])
y_test = test_df['label']

# Create a scaler object
scaler = MinMaxScaler()

# Fit the scaler to the training features and transform both training and testing features
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert the scaled features to DataFrames
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [7]:
model = LogisticRegression()

# Train the model
model.fit(X_train_scaled_df, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled_df)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 188870     418]
 [    447 1424447]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    189288
           1       1.00      1.00      1.00   1424894

    accuracy                           1.00   1614182
   macro avg       1.00      1.00      1.00   1614182
weighted avg       1.00      1.00      1.00   1614182



In [9]:
from sklearn.model_selection import StratifiedKFold, cross_validate

stratified_k_fold = StratifiedKFold(n_splits=3)
scoring = ['accuracy', 'precision', 'recall', 'f1']
cv_results = cross_validate(model, X_train_scaled_df, y_train, cv=stratified_k_fold, scoring=scoring, n_jobs=-1)


# Print cross-validation scores
print("Cross-validation accuracy scores:", cv_results['test_accuracy'])
print("Mean cross-validation accuracy:", cv_results.mean())
print("Cross-validation precision scores:", cv_results['test_precision'])
print("Mean cross-validation precision:", cv_results['test_precision'].mean())
print("Cross-validation recall scores:", cv_results['test_recall'])
print("Mean cross-validation recall:", cv_results['test_recall'].mean())
print("Cross-validation F1 scores:", cv_results['test_f1'])
print("Mean cross-validation F1:", cv_results['test_f1'].mean())

ValueError: 
All the 3 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\grays\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\grays\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\grays\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py", line 1223, in fit
    X, y = self._validate_data(
  File "c:\Users\grays\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "c:\Users\grays\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py", line 1273, in check_X_y
    X = check_array(
  File "c:\Users\grays\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py", line 1007, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "c:\Users\grays\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\_array_api.py", line 746, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 1.60 GiB for an array with shape (4773887, 45) and data type float64

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\grays\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\grays\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\grays\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py", line 1223, in fit
    X, y = self._validate_data(
  File "c:\Users\grays\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "c:\Users\grays\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py", line 1273, in check_X_y
    X = check_array(
  File "c:\Users\grays\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py", line 1007, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "c:\Users\grays\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\_array_api.py", line 746, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 1.60 GiB for an array with shape (4773888, 45) and data type float64
