In [1]:
import pandas as pd
import glob
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Grayson's path
#train_folder = "C:\\Users\\grays\\Downloads\\train\\train\\"
#test_folder = "C:\\Users\\grays\\Downloads\\test\\test\\"

# Will's path
train_folder = "C:\\Users\\willg\\OneDrive\\CSCI\\summer-2024-work\\will-grayson-ML\\train\\"
test_folder = "C:\\Users\\willg\\OneDrive\\CSCI\\summer-2024-work\\will-grayson-ML\\test\\"

# List all CSV files in the train folder
train_files = glob.glob(train_folder + "*.csv")
test_files = glob.glob(test_folder + "*.csv")

# Function to load and concatenate CSV files from a list of file paths
def load_and_concat(files):
    df_list = []
    for file in files:
        df = pd.read_csv(file)
        df_list.append(df)
    return pd.concat(df_list, ignore_index=True)

# Load and concatenate training and testing data
train_df = load_and_concat(train_files)
test_df = load_and_concat(test_files)

In [3]:
def label_data(df):
    df['label'] = (df['ack_flag_number'] < 0.4).astype(int)
    return df

# Label the training and testing data
train_df = label_data(train_df)
test_df = label_data(test_df)

# Separate features and labels
X_train = train_df.drop(columns=['label'])
y_train = train_df['label']
X_test = test_df.drop(columns=['label'])
y_test = test_df['label']

# Create a scaler object
scaler = MinMaxScaler()

# Fit the scaler to the training features and transform both training and testing features
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert the scaled features to DataFrames
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [4]:
model = LogisticRegression()

# Train the model
model.fit(X_train_scaled_df, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled_df)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 188870     418]
 [    447 1424447]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    189288
           1       1.00      1.00      1.00   1424894

    accuracy                           1.00   1614182
   macro avg       1.00      1.00      1.00   1614182
weighted avg       1.00      1.00      1.00   1614182



In [6]:
from sklearn.model_selection import StratifiedKFold, cross_validate

stratified_k_fold = StratifiedKFold(n_splits=3)
scoring = ['accuracy', 'precision', 'recall', 'f1']
cv_results = cross_validate(model, X_train_scaled_df, y_train, cv=stratified_k_fold, scoring=scoring, n_jobs=-1)


# Print cross-validation scores
print("Cross-validation accuracy scores:", cv_results['test_accuracy'])
print("Mean cross-validation accuracy:", cv_results['test_accuracy'].mean())
print("Cross-validation precision scores:", cv_results['test_precision'])
print("Mean cross-validation precision:", cv_results['test_precision'].mean())
print("Cross-validation recall scores:", cv_results['test_recall'])
print("Mean cross-validation recall:", cv_results['test_recall'].mean())
print("Cross-validation F1 scores:", cv_results['test_f1'])
print("Mean cross-validation F1:", cv_results['test_f1'].mean())

Cross-validation accuracy scores: [0.99649049 0.99950858 0.9900559 ]
Mean cross-validation accuracy: 0.9953516560723531
Cross-validation precision scores: [0.99998774 0.99954676 0.98915761]
Mean cross-validation precision: 0.996230703526137
Cross-validation recall scores: [0.99607633 0.99990227 0.99980689]
Mean cross-validation recall: 0.9985951664025977
Cross-validation F1 scores: [0.9980282  0.99972449 0.99445374]
Mean cross-validation F1: 0.9974021439575358
