In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

In [16]:
# Load the test split; the path is resolved relative to the kernel's working directory.
test_df = pd.read_csv(filepath_or_buffer='test_df.csv')

In [17]:
# Load the training split; the path is resolved relative to the kernel's working directory.
train_df = pd.read_csv(filepath_or_buffer='train_df.csv')

In [18]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

Unnamed: 0,sample,AF1,AF2,AF7,AF8,AFZ,C1,C2,C3,C4,...,TP7,TP8,X,Y,nd,subject,alcoholic,match,err,object
0,0,-2.797,-3.448,-0.305,-4.089,-2.797,0.315,1.088,0.651,2.37,...,-8.86,-0.071,-5.544,-4.873,-4.089,co2a0000377,True,nomatch,False,False
1,1,-2.309,-3.937,1.16,-5.066,-3.286,0.804,1.088,1.628,1.882,...,-8.372,0.417,-6.032,-1.943,-0.183,co2a0000377,True,nomatch,False,False
2,2,-1.333,-3.448,2.625,-5.554,-2.797,1.292,0.6,3.092,1.882,...,-3.977,0.905,-4.079,1.475,3.723,co2a0000377,True,nomatch,False,False
3,3,-0.356,-2.472,2.136,-5.066,-1.821,0.804,0.6,3.092,1.394,...,3.347,0.905,-1.149,3.428,5.188,co2a0000377,True,nomatch,False,False
4,4,0.621,-1.495,1.16,-4.578,-0.844,0.315,0.112,2.604,0.417,...,10.183,-0.071,2.268,3.916,2.747,co2a0000377,True,nomatch,False,False


In [28]:
# Column labels of the test frame, kept both as the pandas Index object
# and as a plain Python list.
column_names_index = test_df.columns
column_names_list = list(column_names_index)

# One call, one item per line: same output as printing each separately.
print(column_names_index, column_names_list, sep='\n')

Index(['sample', 'AF1', 'AF2', 'AF7', 'AF8', 'AFZ', 'C1', 'C2', 'C3', 'C4',
       'C5', 'C6', 'CP1', 'CP2', 'CP3', 'CP4', 'CP5', 'CP6', 'CPZ', 'CZ', 'F1',
       'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'FC1', 'FC2', 'FC3', 'FC4',
       'FC5', 'FC6', 'FCZ', 'FP1', 'FP2', 'FPZ', 'FT7', 'FT8', 'FZ', 'O1',
       'O2', 'OZ', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'PO1',
       'PO2', 'PO7', 'PO8', 'POZ', 'PZ', 'T7', 'T8', 'TP7', 'TP8', 'X', 'Y',
       'nd', 'subject', 'alcoholic', 'match', 'err', 'object'],
      dtype='object')
['sample', 'AF1', 'AF2', 'AF7', 'AF8', 'AFZ', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'CP1', 'CP2', 'CP3', 'CP4', 'CP5', 'CP6', 'CPZ', 'CZ', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'FC1', 'FC2', 'FC3', 'FC4', 'FC5', 'FC6', 'FCZ', 'FP1', 'FP2', 'FPZ', 'FT7', 'FT8', 'FZ', 'O1', 'O2', 'OZ', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'PO1', 'PO2', 'PO7', 'PO8', 'POZ', 'PZ', 'T7', 'T8', 'TP7', 'TP8', 'X', 'Y', 'nd', 'subject', 'alcoholic', 'match', 'err', 'object']

In [29]:
# Count how many rows carry each value of the 'alcoholic' label.
test_df.loc[:, 'alcoholic'].value_counts()

alcoholic
False    61440
True     61440
Name: count, dtype: int64

In [21]:
# Subjects whose rows are moved out of the test frame to form a validation set.
validation_subjects = ['co2a0000377', 'co2a0000364', 'co2c0000342', 'co2c0000345']

# Boolean row mask, computed once and used for both halves of the split.
is_validation = test_df['subject'].isin(validation_subjects)
validation_df = test_df[is_validation]
# NOTE: test_df is rebound to the remaining rows here, so re-running this cell
# without reloading the CSV first yields an empty validation_df.
test_df = test_df[~is_validation]

In [22]:
# Columns removed from both frames before modelling; every remaining column
# except the 'alcoholic' label is used as a feature.
drop_cols = ['sample', 'subject', 'match', 'err', 'object', 'X', 'Y', 'nd']
train_df_clean, test_df_clean = (
    frame.drop(columns=drop_cols) for frame in (train_df, test_df)
)

# Separate the label from the features for each split.
y_train, y_test = train_df_clean['alcoholic'], test_df_clean['alcoholic']
X_train = train_df_clean.drop(columns='alcoholic')
X_test = test_df_clean.drop(columns='alcoholic')

# The scaler is fitted on the training features only; the same fitted
# transformation is then applied to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
# k-nearest-neighbours classifier (k = 5) fitted on the scaled training features.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=X_train_scaled, y=y_train)

In [24]:
# Predict the label for every row of the scaled test features.
y_pred = knn.predict(X=X_test_scaled)

In [25]:
# Score the test-set predictions against the true labels.
class_report = classification_report(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [26]:
# Report the evaluation results; each heading is followed by its value on the next line.
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", class_report, sep="\n")

Accuracy: 0.69
Confusion Matrix:
[[32402 29038]
 [ 8925 52515]]
Classification Report:
              precision    recall  f1-score   support

       False       0.78      0.53      0.63     61440
        True       0.64      0.85      0.73     61440

    accuracy                           0.69    122880
   macro avg       0.71      0.69      0.68    122880
weighted avg       0.71      0.69      0.68    122880



In [30]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
# Remove the same columns from the validation rows that were dropped from the train and test frames.
validation_df_clean = validation_df.drop(labels=drop_cols, axis=1)

In [33]:
# Split the validation frame into its label column and the remaining feature columns.
y_validation = validation_df_clean.loc[:, 'alcoholic']
X_validation = validation_df_clean.drop(columns='alcoholic')

In [34]:
# Apply the already-fitted scaler to the validation features (transform only, no refit).
X_validation_scaled = scaler.transform(X=X_validation)

In [35]:
# Stack the scaled training and validation features, and their labels, so one
# stratified sample can be drawn across both.
#
# ignore_index=True gives X_combined and y_combined the same fresh 0..n-1
# index. Without it, X_combined carries two overlapping positional ranges
# (one per wrapped array) while y_combined keeps the row labels of the source
# frames, so the pair has duplicated, mismatched labels and cannot be safely
# aligned by index afterwards.
X_combined = pd.concat(
    [pd.DataFrame(X_train_scaled), pd.DataFrame(X_validation_scaled)],
    ignore_index=True,
)
y_combined = pd.concat([y_train, y_validation], ignore_index=True)

# Keep a small stratified subset and discard the rest. train_test_split picks
# rows by position, so the index labels do not influence which rows are drawn.
sample_size = 0.05  # fraction of rows to keep (5% of the data)
X_sampled, _, y_sampled, _ = train_test_split(
    X_combined,
    y_combined,
    test_size=1 - sample_size,
    random_state=42,
    stratify=y_combined,
)
