In [109]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the data
data_path = 'sph6004_assignment1_data.csv'
data = pd.read_csv(data_path)

# Display the first few rows of the dataset to understand its structure
data.head()

Unnamed: 0,id,aki,gender,admission_age,race,heart_rate_min,heart_rate_max,heart_rate_mean,sbp_min,sbp_max,...,ggt_max,ld_ldh_min,ld_ldh_max,gcs_min,gcs_motor,gcs_verbal,gcs_eyes,gcs_unable,height,weight_admit
0,36570066,3,F,79.953141,BLACK/AFRICAN AMERICAN,96.0,104.0,100.083333,103.0,126.0,...,,236.0,318.0,15.0,6.0,5.0,4.0,0.0,157.0,110.0
1,39307659,0,F,78.194169,WHITE - RUSSIAN,72.0,134.0,97.263158,97.0,127.0,...,,,,15.0,6.0,5.0,4.0,0.0,,82.0
2,38743306,2,F,65.602396,WHITE,60.0,97.0,84.166667,95.0,143.0,...,,,,15.0,6.0,5.0,4.0,0.0,,62.1
3,32339865,2,F,64.906629,UNKNOWN,59.0,87.0,71.461538,113.0,150.0,...,,,,15.0,1.0,0.0,1.0,1.0,170.0,113.1
4,35526987,2,M,57.438861,WHITE,57.0,100.0,82.387097,81.0,127.0,...,,,,15.0,,0.0,1.0,1.0,178.0,97.4


In [11]:
# Check for missing values
missing_values = data.isnull().mean() * 100

# Categorical columns for encoding
categorical_cols = data.select_dtypes(include=['object']).columns

# Summary of missing values and categorical columns
missing_values_summary = missing_values[missing_values > 0].sort_values(ascending=False)


missing_values_summary

thrombin_max       99.821288
thrombin_min       99.821288
d_dimer_min        99.785939
d_dimer_max        99.785939
ggt_max            99.073056
                     ...    
dbp_min             0.190495
dbp_mean            0.190495
heart_rate_mean     0.155145
heart_rate_max      0.155145
heart_rate_min      0.155145
Length: 157, dtype: float64

In [29]:
columns_to_keep = missing_values[missing_values <= 30].index

In [30]:
len(columns_to_keep)

65

In [36]:
columns_to_keep = missing_values[missing_values <= 30].index
data_filtered_less_missing_cols = data[columns_to_keep]
# Determine if there are rows with a high percentage of missing values in the filtered dataset
missing_values_by_row_filtered = data_filtered_less_missing_cols.isnull().sum(axis=1)

# Calculate the percentage of missing values for each row in the filtered dataset
percentage_missing_by_row_filtered = (missing_values_by_row_filtered / data_filtered_less_missing_cols.shape[1]) * 100

# Check the number of rows with more than 50% missing values in the filtered dataset
rows_with_high_missing_values_filtered = percentage_missing_by_row_filtered[percentage_missing_by_row_filtered > 50].count()

rows_with_high_missing_values_filtered


157

In [37]:
# Remove rows with more than 50% missing values in the filtered dataset
data_cleaned = data_filtered_less_missing_cols[percentage_missing_by_row_filtered <= 50]

# Check the shape of the dataset after removing rows with high missing values
data_cleaned_shape = data_cleaned.shape

# Verify the cleanup by checking if there are still rows with high missing values
missing_values_by_row_cleaned = data_cleaned.isnull().sum(axis=1)
percentage_missing_by_row_cleaned = (missing_values_by_row_cleaned / data_cleaned.shape[1]) * 100

data_cleaned_shape


(50763, 65)

In [38]:
# Check for missing values
missing_values_cleaned = data_cleaned.isnull().mean() * 100

# Summary of missing values and categorical columns
missing_values_cleaned_summary = missing_values_cleaned[missing_values_cleaned > 0].sort_values(ascending=False)


missing_values_cleaned_summary

ptt_max              10.222012
ptt_min              10.222012
inr_max               9.686189
inr_min               9.686189
pt_max                9.684219
pt_min                9.684219
calcium_min.1         9.331600
calcium_max.1         9.331600
temperature_max       3.185391
temperature_min       3.185391
temperature_mean      3.185391
glucose_mean          2.013277
glucose_max           2.013277
glucose_min           2.013277
weight_admit          1.991608
glucose_min.2         1.246971
glucose_max.2         1.246971
gcs_motor             1.245001
aniongap_min          0.998759
aniongap_max          0.998759
gcs_verbal            0.973150
potassium_min.1       0.935721
potassium_max.1       0.935721
bicarbonate_max.1     0.898292
bicarbonate_min.1     0.898292
sodium_max.1          0.880563
sodium_min.1          0.880563
chloride_max.1        0.874653
chloride_min.1        0.874653
gcs_eyes              0.705238
hemoglobin_min.1      0.644170
hemoglobin_max.1      0.644170
wbc_min 

In [41]:
data_cleaned = data_cleaned.fillna(0)

In [43]:
data_cleaned.to_csv('data_cleaned.csv')

In [66]:
# For simplicity, encode categorical variables manually (gender and race for now)
data_cleaned['gender'] = data_cleaned['gender'].map({'F': 0, 'M': 1})
data_cleaned = data_cleaned.drop(columns=['race', 'id'])
# Define the target variable (y) and the features (X)
X = data_cleaned.drop('aki', axis=1)
y = data_cleaned['aki']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [69]:

# Compute the correlation of each feature with the target variable
correlations = X.corrwith(y)

# Convert to a DataFrame, take the absolute values, and sort by correlation strength
correlations_df = correlations.abs().sort_values(ascending=False).reset_index()
correlations_df.columns = ['Feature', 'Absolute Correlation']

# Display the ranked features based on their absolute correlation with 'aki'
correlations_df


Unnamed: 0,Feature,Absolute Correlation
0,bun_max,0.242342
1,bun_min,0.234780
2,gcs_verbal,0.195807
3,dbp_min,0.181049
4,sbp_min,0.178101
...,...,...
57,chloride_min.1,0.006101
58,dbp_max,0.004455
59,glucose_min,0.001412
60,bicarbonate_max.1,0.000549


In [91]:
candidate_features = correlations_df[correlations_df['Absolute Correlation']>0.15]['Feature']
X_selected = data_cleaned[candidate_features]

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

In [101]:
X_train.columns

Index(['bun_max', 'bun_min', 'gcs_verbal', 'dbp_min', 'sbp_min', 'mbp_min',
       'admission_age', 'potassium_max.1', 'aniongap_max', 'ptt_max',
       'gcs_unable', 'pt_max', 'gcs_eyes', 'inr_max'],
      dtype='object')

In [104]:
# Implementing a basic version of Forward Selection
# Start with no variables and add them one by one
selected_features = []
current_score, best_new_score = 0.0, 0.0

while True:
    scores_with_candidates = []
    for feature in X_train.columns:
        if feature not in selected_features:
            X_train_selected = X_train[selected_features + [feature]]
            X_test_selected = X_test[selected_features + [feature]]
            rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
            rf_classifier.fit(X_train_selected, y_train)
            y_pred = rf_classifier.predict(X_test_selected)
            score = accuracy_score(y_test, y_pred)
            scores_with_candidates.append((score, feature))
            print(f'score:{score} candidate:{feature}')
    
    scores_with_candidates.sort(reverse=True)
    best_new_score, best_candidate = scores_with_candidates[0]
    
    if best_new_score > current_score:
        selected_features.append(best_candidate)
        current_score = best_new_score
    else:
        break  # Exit loop if no improvement

selected_features

score:0.40520043336944744 candidate:bun_max
score:0.40066975278242883 candidate:bun_min
score:0.3914114054959125 candidate:gcs_verbal
score:0.37594799566630555 candidate:dbp_min
score:0.3776223776223776 candidate:sbp_min
score:0.3725992317541613 candidate:mbp_min
score:0.28543287698217273 candidate:admission_age
score:0.3706293706293706 candidate:potassium_max.1
score:0.35437801634984734 candidate:aniongap_max
score:0.35152171771890084 candidate:ptt_max
score:0.3702353984044125 candidate:gcs_unable
score:0.3792967595784497 candidate:pt_max
score:0.37466758593519156 candidate:gcs_eyes
score:0.38422141239042645 candidate:inr_max
score:0.3935782527331823 candidate:bun_min
score:0.4076627597754358 candidate:gcs_verbal
score:0.3781148429035753 candidate:dbp_min
score:0.3697429331232148 candidate:sbp_min
score:0.37486457204767065 candidate:mbp_min
score:0.31123805771693097 candidate:admission_age
score:0.37555402344134736 candidate:potassium_max.1
score:0.38589579434649857 candidate:aniongap

['bun_max', 'gcs_unable']

In [103]:
scores_with_candidates

[(0.41012508618142424, 'gcs_eyes'),
 (0.4081552250566335, 'gcs_verbal'),
 (0.4020486555697823, 'bun_min'),
 (0.3993893430513149, 'inr_max'),
 (0.38638825962769624, 'aniongap_max'),
 (0.3829410026593125, 'potassium_max.1'),
 (0.37919826652221017, 'mbp_min'),
 (0.37791785679109624, 'dbp_min'),
 (0.3689549886732985, 'pt_max'),
 (0.3641288289175613, 'sbp_min'),
 (0.353393085787452, 'ptt_max'),
 (0.32020092583472864, 'admission_age')]

In [105]:
X_selected_final = data_cleaned[selected_features]
X_train, X_test, y_train, y_test = train_test_split(X_selected_final, y, test_size=0.2, random_state=42)

# random forest

In [106]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = rf_classifier.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.61      0.52      3319
           1       0.16      0.00      0.01      1989
           2       0.38      0.51      0.44      3309
           3       0.37      0.30      0.33      1536

    accuracy                           0.41     10153
   macro avg       0.34      0.36      0.32     10153
weighted avg       0.36      0.41      0.36     10153

Accuracy: 0.41101152368758004


# SVM

In [107]:
# Initializing the SVM classifier
svm_classifier = SVC(kernel='linear', random_state=42)

# Fitting the SVM classifier to the Training set
svm_classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred_svm = svm_classifier.predict(X_test)

# Evaluating the SVM classifier
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))


SVM Accuracy: 0.38303949571555207


# AdaBoost

In [111]:
# Initializing the AdaBoost classifier with a Decision Tree base estimator
ada_classifier = AdaBoostClassifier(
    n_estimators=100,
    random_state=42
)

# Fitting the AdaBoost classifier to the Training set
ada_classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred_ada = ada_classifier.predict(X_test)

# Evaluating the AdaBoost classifier
print("AdaBoost Accuracy:", accuracy_score(y_test, y_pred_ada))




AdaBoost Accuracy: 0.41642864178075445


# Gradient Boost

In [112]:
# Initializing the Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Fitting the Gradient Boosting classifier to the Training set
gb_classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred_gb = gb_classifier.predict(X_test)

# Evaluating the Gradient Boosting classifier
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))


Gradient Boosting Accuracy: 0.41770905151186843
