In [56]:
import pyarrow
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import check_scoring
train_data=pd.read_csv("train.csv",index_col=0)

# Data Exploration #

In [145]:
train_data.select_dtypes(include=["object"])

Unnamed: 0,id,Basic_Demos-Enroll_Season,CGAS-Season,Physical-Season,Fitness_Endurance-Season,FGC-Season,BIA-Season,PAQ_A-Season,PAQ_C-Season,PCIAT-Season,SDS-Season,PreInt_EduHx-Season
0,00008ff9,Fall,Winter,Fall,,Fall,Fall,,,Fall,,Fall
1,000fd460,Summer,,Fall,,Fall,Winter,,Fall,Fall,Fall,Summer
2,00105258,Summer,Fall,Fall,Fall,Fall,,,Summer,Fall,Fall,Summer
3,00115b9f,Winter,Fall,Summer,Summer,Summer,Summer,,Winter,Summer,Summer,Winter
4,0016bb22,Spring,Summer,,,,,Summer,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
3955,ff8a2de4,Fall,Spring,Fall,,Fall,Fall,,Winter,Winter,Winter,Fall
3956,ffa9794a,Winter,,Spring,,Spring,Spring,,Winter,,,Winter
3957,ffcd4dbd,Fall,Spring,Winter,,Winter,Winter,,Winter,Winter,Winter,Fall
3958,ffed1dd5,Spring,Spring,Winter,,Spring,Summer,,Spring,Spring,Spring,Spring


# Visualisation #

In [146]:
train_data.select_dtypes(include=["int","float"]).corr()

Unnamed: 0,level_0,index,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,...,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,sii,Wear_time,Mean,enmo_25%,enmo_50%,enmo_75%,enmo_equals_0_percent
level_0,1.000000,1.000000,0.009360,0.009816,-0.002399,0.016609,0.017452,0.013867,0.009299,-0.001898,...,0.020931,0.022212,0.023235,0.010790,0.036538,-0.030288,-0.060821,-0.054265,-0.033184,0.072838
index,1.000000,1.000000,0.009360,0.009816,-0.002399,0.016609,0.017452,0.013867,0.009299,-0.001898,...,0.020931,0.022212,0.023235,0.010790,0.036538,-0.030288,-0.060821,-0.054265,-0.033184,0.072838
Basic_Demos-Age,0.009360,0.009360,1.000000,0.064004,-0.009994,0.492112,0.880274,0.748149,0.609601,0.107960,...,0.037317,0.037115,0.441822,0.365990,-0.102264,-0.335151,-0.134023,-0.199548,-0.242762,0.081662
Basic_Demos-Sex,0.009816,0.009816,0.064004,1.000000,0.075470,0.025793,-0.014454,0.003245,-0.020013,0.002076,...,-0.020122,-0.018508,-0.000579,-0.100148,-0.032520,-0.132809,-0.020788,-0.036726,-0.062747,-0.002935
CGAS-CGAS_Score,-0.002399,-0.002399,-0.009994,0.075470,1.000000,-0.094074,-0.023185,-0.076399,-0.076116,-0.005292,...,-0.145369,-0.148457,-0.021596,-0.085261,-0.137368,-0.129561,-0.148559,-0.161263,-0.150575,0.101827
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Mean,-0.030288,-0.030288,-0.335151,-0.132809,-0.129561,-0.161550,-0.331752,-0.276492,-0.320046,-0.085294,...,-0.037118,-0.035706,-0.175950,-0.205348,0.704215,1.000000,0.670928,0.854668,0.948880,-0.447379
enmo_25%,-0.060821,-0.060821,-0.134023,-0.020788,-0.148559,-0.008288,-0.124510,-0.065567,-0.105601,-0.038874,...,-0.002489,-0.000129,-0.055298,-0.097237,0.323865,0.670928,1.000000,0.875726,0.741482,-0.471620
enmo_50%,-0.054265,-0.054265,-0.199548,-0.036726,-0.161263,-0.040635,-0.199505,-0.127303,-0.123239,-0.038577,...,-0.021594,-0.019374,-0.107729,-0.155682,0.573601,0.854668,0.875726,1.000000,0.924692,-0.542229
enmo_75%,-0.033184,-0.033184,-0.242762,-0.062747,-0.150575,-0.076395,-0.249976,-0.179234,-0.249017,-0.059962,...,-0.038235,-0.036293,-0.133457,-0.184841,0.683228,0.948880,0.741482,0.924692,1.000000,-0.491830


# Data Cleaning #

In [147]:
def clean_data():
    train_data=pd.read_csv("train.csv",index_col=0)
    train_data.drop(['PCIAT-PCIAT_Total', "PCIAT-Season", 'PCIAT-PCIAT_07', 'PCIAT-PCIAT_13', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_12', 'PCIAT-PCIAT_04', 'PCIAT-PCIAT_18', 'PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_10', 'PCIAT-PCIAT_15', 'PCIAT-PCIAT_20', 'PCIAT-PCIAT_19', 'PCIAT-PCIAT_05', 'PCIAT-PCIAT_09', 'PCIAT-PCIAT_14', 'PCIAT-PCIAT_11', 'PCIAT-PCIAT_17', 'PCIAT-PCIAT_08', 'PCIAT-PCIAT_06', 'PCIAT-PCIAT_16'],axis=1,inplace=True)

# Simple Approach #

## Data Preparation ##

In [None]:
clean_data()
num_col=list(train_data.select_dtypes(include=["int","float"]).columns)
cat_col=list(train_data.select_dtypes(include=["object"]).columns)
num_pipeline=Pipeline([
    ("imputer", SimpleImputer(strategy="median"))
    #("std_scaler", StandardScaler())
])
pipeline=ColumnTransformer([
    ("num", num_pipeline,num_col),
    ("cat", OneHotEncoder(sparse_output=False,handle_unknown="infrequent_if_exist"),cat_col)
])
prepared_data = pipeline.fit_transform(train_data)
transformed_data = pd.DataFrame(prepared_data, columns=num_col + list(pipeline.named_transformers_["cat"].get_feature_names_out(cat_col)))
Y=transformed_data["sii"]
X=transformed_data.copy()
X.drop("sii",inplace=True,axis=1)
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,train_size=0.8,random_state=10)

## Random Forest ##

#### Model Training ####

In [15]:
%%time
forest=RandomForestClassifier()
parameter={"n_estimators":[50,100,200,400],"max_depth":[None,1,2,4,8,16]}
forest_cv=GridSearchCV(forest,parameter,n_jobs=2)
forest_cv.fit(X_train,Y_train)

CPU times: total: 469 ms
Wall time: 26.9 s


#### Evaluation ####

In [16]:
prediction=pd.DataFrame(forest_cv.predict(X_test))
scoring=check_scoring(forest_cv,scoring="accuracy")
scoring(forest_cv,X_test,Y_test)

0.7297979797979798

#### Output ####

In [43]:
test_data=pd.read_csv("test.csv")
test_data["sii"]=0
test_data=pipeline.transform(test_data)
test_data = pd.DataFrame(test_data, columns=num_col + list(pipeline.named_transformers_["cat"].get_feature_names_out(cat_col)))
test_data.drop("sii",axis=1,inplace=True)
prediction=forest_cv.predict(test_data)
prediction=pd.DataFrame(prediction)
test_data_ids=pd.read_csv("test.csv")
prediction["id"]=test_data_ids["id"]
prediction.set_index("id",inplace=True)
prediction.to_csv("random_forest_dropped_features.csv")

# Activity Sensor Augmented Approach #

## Data Preparation ##

In [191]:
%%time
clean_data()
not_found=0
for x in range(len(train_data)):
    id=train_data.loc[x,"id"]
    file_path="Tracker/id=" +id + "/part-0.parquet"
    if not os.path.exists(file_path):
        not_found+=1
        continue
    parquet=pd.read_parquet(file_path)
    train_data.loc[x,"Wear_time"]=parquet["non-wear_flag"].value_counts()[0]/len(parquet)
    train_data.loc[x,"Mean"]=parquet["enmo"].describe()["mean"]
    train_data.loc[x,"enmo_25%"]=parquet["enmo"].describe()["25%"]
    train_data.loc[x,"enmo_50%"]=parquet["enmo"].describe()["50%"]
    train_data.loc[x,"enmo_75%"]=parquet["enmo"].describe()["75%"]
    if len(parquet[parquet["enmo"].isin([0])])>0:
        train_data.loc[x,"enmo_equals_0_percent"]=parquet["enmo"].value_counts()[0]/len(parquet)
    parquet=None
    
splitter=train_data["Wear_time"]>=0
activity_dataset=train_data[splitter==True]
missing_activity_dataset=train_data[splitter==False]

transformed_data_activity = pipeline.fit_transform(activity_dataset)
transformed_data_activity = pd.DataFrame(prepared_data, columns=num_col + list(pipeline.named_transformers_["cat"].get_feature_names_out(cat_col)))
Y1=transformed_data["sii"]
X1=transformed_data.copy()
X1.drop("sii",inplace=True,axis=1)
X1_train,X1_test,Y1_train,Y1_test=train_test_split(X,Y,test_size=0.2,train_size=0.8,random_state=10)

transformed_data = pipeline.fit_transform(missing_activity_dataset)
transformed_data = pd.DataFrame(prepared_data, columns=num_col + list(pipeline.named_transformers_["cat"].get_feature_names_out(cat_col)))
Y2=transformed_data["sii"]
X2=transformed_data.copy()
X2.drop("sii",inplace=True,axis=1)
X2_train,X2_test,Y2_train,Y2_test=train_test_split(X,Y,test_size=0.2,train_size=0.8,random_state=10)

transformed_data = pipeline.fit_transform(train_data)
transformed_data = pd.DataFrame(prepared_data, columns=num_col + list(pipeline.named_transformers_["cat"].get_feature_names_out(cat_col)))
Y3=transformed_data["sii"]
X3=transformed_data.copy()
X3.drop("sii",inplace=True,axis=1)
X3_train,X3_test,Y3_train,Y3_test=train_test_split(X,Y,test_size=0.2,train_size=0.8,random_state=10)

CPU times: total: 2min 7s
Wall time: 1min 23s


#### Model Training ####

In [198]:
%%time
forest1=RandomForestClassifier()
parameter={"n_estimators":[50,100,200,300,400],"max_depth":[None,1,2,4,8,16],"max_leaf_nodes":[None,1,2,3,4,6],"min_samples_split":[2,4,8,16],"min_samples_leaf":[1,2,4,8,16],"criterion":["gini", "entropy", "log_loss"]}
forest_cv1=GridSearchCV(forest1,parameter,n_jobs=2)
forest_cv1.fit(X1_train,Y1_train)

forest2=RandomForestClassifier()
forest_cv2=GridSearchCV(forest2,parameter,n_jobs=2)
forest_cv2.fit(X2_train,Y2_train)

forest3=RandomForestClassifier()
forest_cv3=GridSearchCV(forest3,parameter,n_jobs=2)
forest_cv3.fit(X3_train,Y3_train)

9000 fits failed out of a total of 54000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9000 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Kay\PycharmProjects\practice\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Kay\PycharmProjects\practice\venv\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\Kay\PycharmProjects\practice\venv\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Kay\PycharmProjects\practice\venv\lib\site-packages\sklearn\utils\_param_valida

CPU times: total: 2min 54s
Wall time: 6h 8min 59s


### Evaluation ###

In [202]:
prediction1=pd.DataFrame(forest_cv1.predict(X1_test))
scoring=check_scoring(forest_cv1,scoring="accuracy")
scoring(forest_cv1,X1_test,Y1_test)

0.7361111111111112

In [203]:
prediction2=pd.DataFrame(forest_cv2.predict(X2_test))
scoring=check_scoring(forest_cv2,scoring="accuracy")
scoring(forest_cv2,X2_test,Y2_test)

0.7247474747474747

In [204]:
prediction3=pd.DataFrame(forest_cv3.predict(X3_test))
scoring=check_scoring(forest_cv3,scoring="accuracy")
scoring(forest_cv3,X3_test,Y3_test)

0.7335858585858586