In [1]:
from google.colab import userdata
import json

# Get the Kaggle credentials from Colab's userdata
username = userdata.get("KAGGLE_USER")
key = userdata.get("KAGGLE_KEY")

# Echo the credentials into the kaggle.json file
!mkdir -p ~/.kaggle
!echo '{{"username":"{username}","key":"{key}"}}' > ~/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json


In [2]:
!kaggle competitions download -c io-t-sleep-stage-classification-version-2
!unzip /content/io-t-sleep-stage-classification-version-2.zip && rm -rf /content/io-t-sleep-stage-classification-version-2.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: test_segment/test_segment/test003/test003_00692.csv  
  inflating: test_segment/test_segment/test003/test003_00693.csv  
  inflating: test_segment/test_segment/test003/test003_00694.csv  
  inflating: test_segment/test_segment/test003/test003_00695.csv  
  inflating: test_segment/test_segment/test003/test003_00696.csv  
  inflating: test_segment/test_segment/test003/test003_00697.csv  
  inflating: test_segment/test_segment/test003/test003_00698.csv  
  inflating: test_segment/test_segment/test003/test003_00699.csv  
  inflating: test_segment/test_segment/test003/test003_00700.csv  
  inflating: test_segment/test_segment/test004/test004_00000.csv  
  inflating: test_segment/test_segment/test004/test004_00001.csv  
  inflating: test_segment/test_segment/test004/test004_00002.csv  
  inflating: test_segment/test_segment/test004/test004_00003.csv  
  inflating: test_segment/test_segment/test004/test004_00004.csv

In [3]:
import pandas as pd
import glob

file_list = glob.glob("/content/train/train/*.csv")
all_data = []
for file in file_list:
    try:
        df = pd.read_csv(file)

        all_data.append(df)

    except Exception as e:
        print(f"⚠️ อ่านไฟล์ {file} ไม่ได้: {e}")

combined_df = pd.concat(all_data, ignore_index=True)
combined_df

📊 ข้อมูลรวมทั้งหมดมีขนาด: (31907040, 9)


Unnamed: 0,BVP,ACC_X,ACC_Y,ACC_Z,TEMP,EDA,HR,IBI,Sleep_Stage
0,2.418650,-14.828296,30.645146,53.385008,33.264834,0.278546,88.030792,0.787580,W
1,1.526770,-14.828296,30.645146,52.653695,33.264823,0.278577,88.030735,0.788122,W
2,2.428460,-14.828296,30.645146,53.279565,33.264756,0.278481,88.030291,0.787050,W
3,0.924600,-14.828296,30.645146,53.049354,33.264908,0.278677,88.031277,0.788368,W
4,0.188797,-14.828296,30.645146,53.178063,33.264693,0.277723,88.029887,0.786861,W
...,...,...,...,...,...,...,...,...,...
31907035,7.471146,-7.955707,37.565038,50.902734,35.143026,0.106351,56.713585,0.942194,W
31907036,-0.556221,-7.814153,37.564986,51.253416,35.143145,0.106385,56.712603,0.942242,W
31907037,-7.292864,-8.163047,37.565050,50.536319,35.142969,0.106378,56.714083,0.942190,W
31907038,-15.821066,-8.264469,37.564994,50.477461,35.143138,0.106369,56.712645,0.942230,W


In [4]:
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
from scipy.fftpack import fft

segment_size = 480

def extract_features(segment):
    features = {}

    # ดึงค่าทางสถิติจากแต่ละเซ็นเซอร์
    for col in ["BVP", "ACC_X", "ACC_Y", "ACC_Z", "TEMP", "EDA", "HR", "IBI"]:
        values = segment[col].values
        features[f"mean_{col}"] = np.mean(values)
        features[f"std_{col}"] = np.std(values)
        features[f"max_{col}"] = np.max(values)
        features[f"min_{col}"] = np.min(values)
        features[f"median_{col}"] = np.median(values)
        features[f"skew_{col}"] = skew(values)
        features[f"kurtosis_{col}"] = kurtosis(values)

    # คำนวณ SMA (Signal Magnitude Area) สำหรับการเคลื่อนไหว
    features["sma_acc"] = np.sum(np.abs(segment["ACC_X"]) + np.abs(segment["ACC_Y"]) + np.abs(segment["ACC_Z"]))

    # คำนวณพลังงานของสัญญาณ (Energy)
    for col in ["BVP", "EDA", "HR"]:
        features[f"energy_{col}"] = np.sum(np.square(values)) / len(values)


    fft_values = np.abs(fft(segment["HR"].values))
    features["fft_peak_hr"] = np.argmax(fft_values)


    features["Sleep_Stage"] = segment["Sleep_Stage"].values[0]

    return features


feature_list = []
for i in range(0, len(combined_df), segment_size):
    segment = combined_df.iloc[i:i+segment_size]
    if len(segment) == segment_size:
        feature_list.append(extract_features(segment))


feature_df = pd.DataFrame(feature_list)

  features[f"skew_{col}"] = skew(values)
  features[f"kurtosis_{col}"] = kurtosis(values)


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder

# แยก Features และ Labels
X = feature_df.drop(columns=["Sleep_Stage"])
# แปลง Sleep_Stage จากตัวอักษรเป็นตัวเลข
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(feature_df["Sleep_Stage"])

# ดูการแมปค่า Label
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# ใช้ Random Forest เพื่อเลือก Features ที่สำคัญ
selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
selector.fit(X, y)

# เลือกเฉพาะ Features ที่สำคัญ
X_selected = X.loc[:, selector.get_support()]





In [6]:
X_selected

Unnamed: 0,std_BVP,max_BVP,min_BVP,kurtosis_BVP,mean_ACC_X,std_ACC_X,max_ACC_X,min_ACC_X,median_ACC_X,mean_ACC_Y,...,mean_TEMP,max_TEMP,min_TEMP,median_TEMP,mean_EDA,max_EDA,min_EDA,median_EDA,std_HR,sma_acc
0,3.380333,6.721362,-10.770070,-0.493754,-14.355264,2.244520e+00,-0.757936,-19.726688,-14.828335,30.464842,...,33.251327,33.275627,33.222905,33.252814,0.278338,0.281467,0.275844,0.278518,0.308713,47081.203465
1,3.935875,31.704457,-14.867657,13.311258,-7.918330,7.591805e-02,-7.691100,-8.768982,-7.908425,30.633918,...,33.247007,33.268529,33.223404,33.245042,0.278222,0.281402,0.275781,0.278512,0.670249,44602.966736
2,3.048122,5.466780,-6.612507,-0.977987,-7.908425,9.196477e-10,-7.908425,-7.908425,-7.908425,30.645146,...,33.250310,33.266686,33.223404,33.245071,0.278463,0.280200,0.274561,0.278546,0.622197,44603.515631
3,3.019995,7.930728,-6.514300,-0.886724,-7.976691,2.974962e-01,-6.830426,-9.157902,-7.908425,30.666774,...,33.248338,33.267122,33.221574,33.245045,0.279496,0.282604,0.276923,0.279736,0.281799,44634.303798
4,3.080849,8.962223,-11.341948,-0.315907,-9.390520,4.679507e-01,-5.900315,-10.680165,-9.463630,30.620402,...,33.250956,33.268529,33.222921,33.245079,0.279994,0.282696,0.276998,0.279834,0.830735,45270.894875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66468,22.398665,40.263323,-42.427387,-1.208328,-9.843343,1.449272e-01,-8.811666,-10.077662,-9.885198,37.415862,...,35.021138,35.046070,34.981172,35.024394,0.109874,0.111710,0.106086,0.110145,1.775145,46957.876672
66469,21.826509,42.411884,-46.638104,-1.109979,-9.637162,4.684972e-01,-7.839029,-10.907642,-9.883842,37.409413,...,35.059321,35.085615,35.036469,35.063977,0.109759,0.112879,0.106055,0.110080,0.769390,46869.171661
66470,22.656818,43.579126,-44.022952,-1.091839,-9.811608,2.570399e-01,-8.779109,-11.453235,-9.885511,37.121866,...,35.097558,35.147714,35.040476,35.093589,0.107185,0.109271,0.104824,0.107511,0.730192,46878.500774
66471,22.659089,38.140872,-43.031622,-1.189028,-7.978950,3.906126e-01,-6.726895,-10.024153,-7.908425,37.578745,...,35.142043,35.166545,35.088962,35.143062,0.107372,0.109168,0.104861,0.107626,1.167451,46125.325063


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import xgboost as xgb


X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# โมเดลที่ 1: Random Forest
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# โมเดลที่ 2: XGBoost
xgb_model = xgb.XGBClassifier(n_estimators=200, max_depth=10, learning_rate=0.05, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)


rf_acc = accuracy_score(y_test, rf_pred)
xgb_acc = accuracy_score(y_test, xgb_pred)
rf_f1 = f1_score(y_test, rf_pred, average='weighted')
xgb_f1 = f1_score(y_test, xgb_pred, average='weighted')

print(f"🎯 Random Forest: Accuracy = {rf_acc:.4f}, F1-score = {rf_f1:.4f}")
print(f"🚀 XGBoost: Accuracy = {xgb_acc:.4f}, F1-score = {xgb_f1:.4f}")


🎯 Random Forest: Accuracy = 0.7447, F1-score = 0.6968
🚀 XGBoost: Accuracy = 0.8618, F1-score = 0.8576


In [23]:
import pandas as pd
import glob
import os
import numpy as np
from scipy.stats import skew, kurtosis
from scipy.fftpack import fft

# กำหนด path หลักที่เก็บไฟล์ทดสอบ
test_path = "/content/test_segment/test_segment/"

# ค้นหาไฟล์ CSV ทุกไฟล์ภายใต้ test_segment (รวมทุกโฟลเดอร์ย่อย)
test_file_list = glob.glob(os.path.join(test_path, "**/*.csv"), recursive=True)

# ลิสต์เก็บ DataFrame ของแต่ละไฟล์ที่ผ่าน Feature Extraction แล้ว
feature_data = []

# ฟังก์ชัน Feature Extraction จาก Segment
def extract_features(segment, subject_id):
    features = {}

    # ดึงค่าทางสถิติจากแต่ละเซ็นเซอร์
    for col in ["BVP", "ACC_X", "ACC_Y", "ACC_Z", "TEMP", "EDA", "HR", "IBI"]:
        values = segment[col].values
        features[f"mean_{col}"] = np.mean(values)
        features[f"std_{col}"] = np.std(values)
        features[f"max_{col}"] = np.max(values)
        features[f"min_{col}"] = np.min(values)
        features[f"median_{col}"] = np.median(values)
        features[f"skew_{col}"] = skew(values)
        features[f"kurtosis_{col}"] = kurtosis(values)

    # คำนวณ SMA (Signal Magnitude Area) สำหรับการเคลื่อนไหว
    features["sma_acc"] = np.sum(np.abs(segment["ACC_X"]) + np.abs(segment["ACC_Y"]) + np.abs(segment["ACC_Z"]))

    # คำนวณพลังงานของสัญญาณ (Energy)
    for col in ["BVP", "EDA", "HR"]:
        features[f"energy_{col}"] = np.sum(np.square(values)) / len(values)

    # คำนวณ FFT (Peak Frequency)
    fft_values = np.abs(fft(segment["HR"].values))
    features["fft_peak_hr"] = np.argmax(fft_values)  # จุดสูงสุดของสัญญาณ HR

    # ระบุ Segment_ID
    features["Segment_ID"] = subject_id

    return features


segment_size = 480

for file in test_file_list:
    try:
        df = pd.read_csv(file)


        subject_id = os.path.basename(file).replace(".csv", "")


        for i in range(0, len(df), segment_size):
            segment = df.iloc[i:i+segment_size]
            if len(segment) == segment_size:
                feature_data.append(extract_features(segment, subject_id))

    except Exception as e:
        print(f"⚠️ อ่านไฟล์ {file} ไม่ได้: {e}")


test_feature_df = pd.DataFrame(feature_data)

test_feature_df


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  features[f"skew_{col}"] = skew(values)
  features[f"kurtosis_{col}"] = kurtosis(values)
  features[f"skew_{col}"] = skew(values)
  features[f"kurtosis_{col}"] = kurtosis(values)
  features[f"skew_{col}"] = skew(values)
  features[f"kurtosis_{col}"] = kurtosis(values)
  features[f"skew_{col}"] = skew(values)
  features[f"kurtosis_{col}"] = kurtosis(values)
  features[f"skew_{col}"] = skew(values)
  features[f"kurtosis_{col}"] = kurtosis(values)
  features[f"skew_{col}"] = skew(values)
  features[f"kurtosis_{col}"] = kurtosis(values)
  features[f"skew_{col}"] = skew(values)
  features[f"kurtosis_{col}"] = kurtosis(values)
  features[f"skew_{col}"] = skew(values)
  features[f"kurtosis_{col}"] = kurtosis(values)
  features[f"skew_{col}"] = skew(values)
  features[f"kurtosis_{col}"] = kurtosis(values)
  features[f"skew_{col}"] = skew(values)
  features[f"kurtosis_{col}"] = kurtosis(values)
  features[f"skew_{col}"] = skew(va

Unnamed: 0,mean_BVP,std_BVP,max_BVP,min_BVP,median_BVP,skew_BVP,kurtosis_BVP,mean_ACC_X,std_ACC_X,max_ACC_X,...,min_IBI,median_IBI,skew_IBI,kurtosis_IBI,sma_acc,energy_BVP,energy_EDA,energy_HR,fft_peak_hr,Segment_ID
0,-0.660568,49.584365,78.714863,-95.118212,7.442161,-0.278681,-1.117319,-45.443401,0.114677,-44.715476,...,0.892994,0.941676,-0.080938,-0.708926,49825.810290,0.881601,0.881601,0.881601,0,test006_00139
1,-0.282654,45.027972,100.510150,-102.085505,8.021984,-0.255636,-1.079404,-63.262251,0.295319,-60.465970,...,0.877557,0.911473,0.080295,-1.185611,36464.000393,0.853562,0.853562,0.853562,0,test006_00060
2,1.254593,97.369294,154.554149,-194.467723,20.694666,-0.333876,-1.127100,-57.339169,0.035139,-57.228456,...,0.894521,0.973573,-0.411592,-0.739105,44557.022975,0.945593,0.945593,0.945593,0,test006_00370
3,0.063165,84.857607,148.536971,-195.968385,12.190177,-0.301702,-1.127790,-57.410159,0.247891,-56.192492,...,0.922523,1.003771,-0.285750,-0.529898,44464.659667,0.996818,0.996818,0.996818,0,test006_00391
4,-0.888846,62.167925,103.825793,-117.624006,6.300406,-0.152799,-1.201254,42.536589,0.109506,43.270503,...,0.938563,0.972658,-0.078236,-1.007763,49364.876491,0.936236,0.936236,0.936236,0,test006_00580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7005,-0.021821,7.182946,14.357846,-14.613163,0.750254,-0.091152,-1.091981,34.084671,0.437056,35.517934,...,0.669161,0.801200,-0.614056,2.339741,52151.281852,0.628774,0.628774,0.628774,0,test004_00095
7006,0.047897,2.972990,7.344961,-7.039800,0.299656,-0.109628,-0.759120,-46.286700,0.257063,-45.268635,...,0.733130,0.818571,0.031073,-1.292938,43685.266796,0.701736,0.701736,0.701736,0,test004_00020
7007,-0.002356,4.867600,8.980665,-11.299846,0.568210,-0.254652,-1.092727,22.318288,0.506093,23.605235,...,0.750933,0.802522,2.092234,7.324980,51063.000753,0.644067,0.644067,0.644067,0,test004_00064
7008,0.063286,7.163264,14.056709,-13.331737,0.772995,-0.063407,-1.248231,44.474975,0.062850,44.575000,...,0.830301,0.895751,-0.279648,-0.931523,49221.205859,0.797515,0.797515,0.797515,0,test004_00622


In [9]:
test_feature_df.columns

Index(['mean_BVP', 'std_BVP', 'max_BVP', 'min_BVP', 'median_BVP', 'skew_BVP',
       'kurtosis_BVP', 'mean_ACC_X', 'std_ACC_X', 'max_ACC_X', 'min_ACC_X',
       'median_ACC_X', 'skew_ACC_X', 'kurtosis_ACC_X', 'mean_ACC_Y',
       'std_ACC_Y', 'max_ACC_Y', 'min_ACC_Y', 'median_ACC_Y', 'skew_ACC_Y',
       'kurtosis_ACC_Y', 'mean_ACC_Z', 'std_ACC_Z', 'max_ACC_Z', 'min_ACC_Z',
       'median_ACC_Z', 'skew_ACC_Z', 'kurtosis_ACC_Z', 'mean_TEMP', 'std_TEMP',
       'max_TEMP', 'min_TEMP', 'median_TEMP', 'skew_TEMP', 'kurtosis_TEMP',
       'mean_EDA', 'std_EDA', 'max_EDA', 'min_EDA', 'median_EDA', 'skew_EDA',
       'kurtosis_EDA', 'mean_HR', 'std_HR', 'max_HR', 'min_HR', 'median_HR',
       'skew_HR', 'kurtosis_HR', 'mean_IBI', 'std_IBI', 'max_IBI', 'min_IBI',
       'median_IBI', 'skew_IBI', 'kurtosis_IBI', 'sma_acc', 'energy_BVP',
       'energy_EDA', 'energy_HR', 'fft_peak_hr', 'Segment_ID'],
      dtype='object')

In [10]:
X_selected.columns

Index(['std_BVP', 'max_BVP', 'min_BVP', 'kurtosis_BVP', 'mean_ACC_X',
       'std_ACC_X', 'max_ACC_X', 'min_ACC_X', 'median_ACC_X', 'mean_ACC_Y',
       'std_ACC_Y', 'max_ACC_Y', 'min_ACC_Y', 'median_ACC_Y', 'mean_ACC_Z',
       'std_ACC_Z', 'max_ACC_Z', 'min_ACC_Z', 'median_ACC_Z', 'mean_TEMP',
       'max_TEMP', 'min_TEMP', 'median_TEMP', 'mean_EDA', 'max_EDA', 'min_EDA',
       'median_EDA', 'std_HR', 'sma_acc'],
      dtype='object')

In [24]:
selected_columns = X_selected.columns

common_columns = [col for col in selected_columns if col in test_feature_df.columns]

test_feature_df1 = test_feature_df[common_columns]
test_feature_df1.columns

Index(['std_BVP', 'max_BVP', 'min_BVP', 'kurtosis_BVP', 'mean_ACC_X',
       'std_ACC_X', 'max_ACC_X', 'min_ACC_X', 'median_ACC_X', 'mean_ACC_Y',
       'std_ACC_Y', 'max_ACC_Y', 'min_ACC_Y', 'median_ACC_Y', 'mean_ACC_Z',
       'std_ACC_Z', 'max_ACC_Z', 'min_ACC_Z', 'median_ACC_Z', 'mean_TEMP',
       'max_TEMP', 'min_TEMP', 'median_TEMP', 'mean_EDA', 'max_EDA', 'min_EDA',
       'median_EDA', 'std_HR', 'sma_acc'],
      dtype='object')

In [25]:
predictions = rf_model.predict(test_feature_df1)

# แปลง Label กลับจากตัวเลข → ตัวอักษร (W, N, R)
predictions = label_encoder.inverse_transform(predictions)

In [26]:
submission_df = pd.DataFrame({
    "id": test_feature_df["Segment_ID"],
    "labels": predictions
})
submission_df = submission_df.sort_values(by=["id"]).reset_index(drop=True)

In [27]:
submission_df.head()

Unnamed: 0,id,labels
0,test001_00000,W
1,test001_00001,W
2,test001_00002,W
3,test001_00003,W
4,test001_00004,W


In [28]:
submission_df.to_csv("submission_rf.csv", index=False)