In [1]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import itertools

In [2]:
# 1st Trial : RandomForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [3]:
df = pd.read_csv('./data./train.csv')
df.head()

Unnamed: 0,ID,SUBCLASS,A2M,AAAS,AADAT,AARS1,ABAT,ABCA1,ABCA2,ABCA3,...,ZNF292,ZNF365,ZNF639,ZNF707,ZNFX1,ZNRF4,ZPBP,ZW10,ZWINT,ZYX
0,TRAIN_0000,KIPAN,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
1,TRAIN_0001,SARC,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
2,TRAIN_0002,SKCM,R895R,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
3,TRAIN_0003,KIRC,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
4,TRAIN_0004,GBMLGG,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT


In [4]:
t_df = pd.read_csv('./data./test.csv')
t_df.head()

Unnamed: 0,ID,A2M,AAAS,AADAT,AARS1,ABAT,ABCA1,ABCA2,ABCA3,ABCA4,...,ZNF292,ZNF365,ZNF639,ZNF707,ZNFX1,ZNRF4,ZPBP,ZW10,ZWINT,ZYX
0,TEST_0000,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
1,TEST_0001,WT,WT,WT,WT,WT,R587Q,WT,WT,WT,...,WT,WT,WT,WT,WT,I383Sfs,WT,WT,WT,WT
2,TEST_0002,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
3,TEST_0003,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
4,TEST_0004,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT


In [5]:
# 1번 시도: 문제 있는 것들을 모두 mutation으로 변형 후 모델링 돌려보자
# df 복제하기
df_1 = df.copy()

In [6]:
# Feature, Target split
X = df_1.drop(columns=['SUBCLASS'])
y = df_1['SUBCLASS']

In [7]:
# mutation으로 변경하기
X = X.applymap(lambda x: 'Mutation' if x != 'WT' else x)

X.head()

Unnamed: 0,ID,A2M,AAAS,AADAT,AARS1,ABAT,ABCA1,ABCA2,ABCA3,ABCA4,...,ZNF292,ZNF365,ZNF639,ZNF707,ZNFX1,ZNRF4,ZPBP,ZW10,ZWINT,ZYX
0,Mutation,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
1,Mutation,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
2,Mutation,Mutation,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
3,Mutation,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
4,Mutation,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT


In [8]:
# label encoding이용해서 범주형 데이터를 수치형으로 변환
# label_encoder = LabelEncoder()

In [9]:
# X 각 열에 대해 : LAbel Encoding 적용
label_encoders = {}
for col in X.columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

In [10]:
# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Random Forest Train
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [12]:
# 예측 및 평가
y_pred = rf_model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'F1 score: {f1:.2f}')

F1 score: 0.25


In [13]:
df

Unnamed: 0,ID,SUBCLASS,A2M,AAAS,AADAT,AARS1,ABAT,ABCA1,ABCA2,ABCA3,...,ZNF292,ZNF365,ZNF639,ZNF707,ZNFX1,ZNRF4,ZPBP,ZW10,ZWINT,ZYX
0,TRAIN_0000,KIPAN,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
1,TRAIN_0001,SARC,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
2,TRAIN_0002,SKCM,R895R,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
3,TRAIN_0003,KIRC,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
4,TRAIN_0004,GBMLGG,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6196,TRAIN_6196,LUAD,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
6197,TRAIN_6197,LGG,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
6198,TRAIN_6198,COAD,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,T181S,WT
6199,TRAIN_6199,TGCT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT


In [5]:
# 2 second trial
df_2 = df.copy()

In [7]:
# 모든 변이들을 분리하는 함수
def split_mutations(mutation):
    if mutation == 'WT':
        return []
    return mutation.split()

In [7]:
# 변이들 분리해 새로운 DataFrame 생성
# ID 제외하고 유전체 컬럼들 정리하기
# genomic_columns = df_2.columns[2:]

# for col in genomic_columns:
#     df_2[f'{col}_Split'] = df_2[col].apply(split_mutations)

for col in df_2.columns[2:]:
    df_2[col] = df_2[col].apply(split_mutations)

In [17]:
# 각 변이들의 유니크한 리스트 생성
# all_mutations = set()
# for col in df_2.columns[2:]:
#     all_mutations.update(mutation for sublist in df_2[col] for mutation in sublist)

In [8]:
# 빈 DataFrame 생성 (추가할 원핫 인코딩된 컬럼들을 위해)
encoded_df = pd.DataFrame()

# 각 유전체 컬럼을 순회하며 원핫 인코딩 적용
for col in df_2.columns[2:]:
    # 각 유전체 컬럼에 대해 변이 리스트를 원핫 인코딩 처리
    temp_encoded = pd.get_dummies(df_2[col].apply(lambda x: ' '.join(x)), prefix=f'Mutation_{col}')
    # 기존 데이터프레임에 병합
    encoded_df = pd.concat([encoded_df, temp_encoded], axis=1)

In [9]:
df_2f = pd.concat([df_2[['ID', 'SUBCLASS']], encoded_df], axis=1)
df_2f.head()

Unnamed: 0,ID,SUBCLASS,Mutation_A2M_,Mutation_A2M_A1043T,Mutation_A2M_A1061V I1056I,Mutation_A2M_A1108T,Mutation_A2M_A1379A,Mutation_A2M_A615A,Mutation_A2M_A788A,Mutation_A2M_C1321Y I848I P529H,...,Mutation_ZYX_R59L,Mutation_ZYX_S182N,Mutation_ZYX_S258A,Mutation_ZYX_S303P,Mutation_ZYX_T480I,Mutation_ZYX_V212A,Mutation_ZYX_V234V,Mutation_ZYX_V471M R489Q,Mutation_ZYX_V524A,Mutation_ZYX_V560M
0,TRAIN_0000,KIPAN,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,TRAIN_0001,SARC,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,TRAIN_0002,SKCM,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,TRAIN_0003,KIRC,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,TRAIN_0004,GBMLGG,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:
# training 시작
train = df_2f[df_2f.columns[2:]]
target = df_2f['SUBCLASS']

In [11]:
rfc = RandomForestClassifier()

In [12]:
x_train, x_test, y_train, y_test = train_test_split(train, target, test_size=0.3)

In [13]:
rfc.fit(x_train, y_train)

In [14]:
# predict
y_predict = rfc.predict(x_test)
f1 = f1_score(y_test, y_predict, average='weighted')
print(f'F1 score: {f1:.2f}')

F1 score: 0.22


In [28]:
df.describe()

Unnamed: 0,ID,SUBCLASS,A2M,AAAS,AADAT,AARS1,ABAT,ABCA1,ABCA2,ABCA3,...,ZNF292,ZNF365,ZNF639,ZNF707,ZNFX1,ZNRF4,ZPBP,ZW10,ZWINT,ZYX
count,6201,6201,6201,6201,6201,6201,6201,6201,6201,6201,...,6201,6201,6201,6201,6201,6201,6201,6201,6201,6201
unique,6201,26,141,43,25,1,48,145,119,129,...,168,69,33,27,126,61,53,53,35,41
top,TRAIN_0000,BRCA,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
freq,1,786,6053,6157,6176,6201,6153,6045,6079,6069,...,6025,6131,6169,6174,6065,6139,6143,6142,6166,6159


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6201 entries, 0 to 6200
Columns: 4386 entries, ID to ZYX
dtypes: object(4386)
memory usage: 207.5+ MB


In [15]:
tdf = t_df.copy()

In [16]:
# t_df에 대해 ID 제외한 feature 데이터 추출
for col in tdf.columns[1:]:
    tdf[col] = tdf[col].astype(str)  # 모든 값을 문자열로 변환
    tdf[col] = tdf[col].apply(split_mutations)  # split_mutations 적용

In [17]:
# t_df에 적용하자
# 모든 변이들을 분리하는 함수
# t_df에 대해 ID 제외한 feature 데이터 추출
# for col in t_df.columns[1:]:
#     t_df[col] = t_df[col].apply(split_mutations)  # 각 컬럼의 변이 리스트를 처리

# 빈 DataFrame 생성 (추가할 원핫 인코딩된 컬럼들을 위해)
encoded_tdf = pd.DataFrame()

# 각 유전체 컬럼을 순회하며 원핫 인코딩 적용
for col in tdf.columns[1:]:
    # 각 유전체 컬럼에 대해 변이 리스트를 원핫 인코딩 처리
    temp_encoded = pd.get_dummies(tdf[col].apply(lambda x: ' '.join(x) if isinstance(x, list) else ''), prefix=f'Mutation_{col}')
    # 기존 데이터프레임에 병합
    encoded_tdf = pd.concat([encoded_tdf, temp_encoded], axis=1)

# ID 컬럼과 원핫 인코딩된 데이터 병합
t_dff = pd.concat([t_df[['ID']], encoded_tdf], axis=1)
t_dff.head()

Unnamed: 0,ID,Mutation_A2M_,Mutation_A2M_713_714MG>IR V87V,Mutation_A2M_A1003T A953T A1103T,Mutation_A2M_A1108T A1008T A958T,Mutation_A2M_A1160V A1260V A1110V,Mutation_A2M_A1215T,Mutation_A2M_A1219V A1069V A1119V,Mutation_A2M_A1260D A1160D A1110D,Mutation_A2M_A1279A A1229A A1379A,...,Mutation_ZYX_Q53P,Mutation_ZYX_Q53R,Mutation_ZYX_R5Afs,Mutation_ZYX_V398I V367I,Mutation_ZYX_V462D V493D,Mutation_ZYX_V493V V524V,Mutation_ZYX_V523L V492L,Mutation_ZYX_V60A,Mutation_ZYX_W200L,Mutation_ZYX_Y469Y Y500Y
0,TEST_0000,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,TEST_0001,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,TEST_0002,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,TEST_0003,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,TEST_0004,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [37]:
t_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2546 entries, 0 to 2545
Columns: 4385 entries, ID to ZYX
dtypes: object(4385)
memory usage: 85.2+ MB


In [19]:
float_columns = tdf.select_dtypes(include=['float64']).columns
float_columns

Index([], dtype='object')

In [20]:
tdf.isna().sum()

ID       0
A2M      0
AAAS     0
AADAT    0
AARS1    0
        ..
ZNRF4    0
ZPBP     0
ZW10     0
ZWINT    0
ZYX      0
Length: 4385, dtype: int64

In [18]:
test_id = t_dff['ID']
t_dff.drop(columns='ID', inplace=True)

In [19]:
rfccase = rfc.predict(t_dff)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Mutation_A2M_713_714MG>IR V87V
- Mutation_A2M_A1003T A953T A1103T
- Mutation_A2M_A1108T A1008T A958T
- Mutation_A2M_A1160V A1260V A1110V
- Mutation_A2M_A1215T
- ...
Feature names seen at fit time, yet now missing:
- Mutation_A2M_A1043T
- Mutation_A2M_A1061V I1056I
- Mutation_A2M_A1108T
- Mutation_A2M_A1379A
- Mutation_A2M_A615A
- ...


In [5]:
# 3rd trial
df_3 = df.copy()

In [6]:
# 모든 변이들을 분리하는 함수
def split_mutations(mutation):
    if mutation == 'WT':
        return []
    return mutation.split()

In [7]:
for col in df_3.columns[2:]:
    df_3[col] = df_3[col].apply(split_mutations)

In [8]:
encoded_df = pd.DataFrame()
# 각 유전체 컬럼을 순회하며 원핫 인코딩 적용
for col in df_3.columns[2:]:
    # 각 유전체 컬럼에 대해 변이 리스트를 원핫 인코딩 처리
    temp_encoded = pd.get_dummies(df_3[col].apply(lambda x: ' '.join(x)), prefix=f'Mutation_{col}')
    # 기존 데이터프레임에 병합
    encoded_df = pd.concat([encoded_df, temp_encoded], axis=1)

In [9]:
df_3f = pd.concat([df_3[['ID', 'SUBCLASS']], encoded_df], axis=1)
df_3f.head()

Unnamed: 0,ID,SUBCLASS,Mutation_A2M_,Mutation_A2M_A1043T,Mutation_A2M_A1061V I1056I,Mutation_A2M_A1108T,Mutation_A2M_A1379A,Mutation_A2M_A615A,Mutation_A2M_A788A,Mutation_A2M_C1321Y I848I P529H,...,Mutation_ZYX_R59L,Mutation_ZYX_S182N,Mutation_ZYX_S258A,Mutation_ZYX_S303P,Mutation_ZYX_T480I,Mutation_ZYX_V212A,Mutation_ZYX_V234V,Mutation_ZYX_V471M R489Q,Mutation_ZYX_V524A,Mutation_ZYX_V560M
0,TRAIN_0000,KIPAN,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,TRAIN_0001,SARC,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,TRAIN_0002,SKCM,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,TRAIN_0003,KIRC,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,TRAIN_0004,GBMLGG,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:
# Classification Model
import xgboost as xgb
from sklearn.linear_model import SGDClassifier

In [14]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_3f['SUBCLASS'] = le.fit_transform(df_3f['SUBCLASS'])

In [15]:
# training 시작
train = df_3f[df_3f.columns[2:]]
target = df_3f['SUBCLASS']

In [16]:
x_train, x_test, y_train, y_test = train_test_split(train, target, test_size=0.3)

In [17]:
# Convert to DMatrix
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

In [18]:
# Set parameters
params = {
    'objective': 'multi:softmax',
    'num_class': len(y_train.unique()),
    'eval_metric': 'mlogloss'
}

In [19]:
xgb_m = xgb.train(params, dtrain, num_boost_round=100)

In [20]:
y_pred = xgb_m.predict(dtest)

In [22]:
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'F1 score: {f1:.2f}')

F1 score: 0.33


In [5]:
# 4th trial
df_4 = df.copy()

In [8]:
for col in df_4.columns[2:]:
    df_4[col] = df_4[col].apply(split_mutations)

In [9]:
encoded_df = pd.DataFrame()
# 각 유전체 컬럼을 순회하며 원핫 인코딩 적용
for col in df_4.columns[2:]:
    # 각 유전체 컬럼에 대해 변이 리스트를 원핫 인코딩 처리
    temp_encoded = pd.get_dummies(df_4[col].apply(lambda x: ' '.join(x)), prefix=f'Mutation_{col}')
    # 기존 데이터프레임에 병합
    encoded_df = pd.concat([encoded_df, temp_encoded], axis=1)

In [10]:
df_4f = pd.concat([df_4[['ID', 'SUBCLASS']], encoded_df], axis=1)
df_4f.head()

Unnamed: 0,ID,SUBCLASS,Mutation_A2M_,Mutation_A2M_A1043T,Mutation_A2M_A1061V I1056I,Mutation_A2M_A1108T,Mutation_A2M_A1379A,Mutation_A2M_A615A,Mutation_A2M_A788A,Mutation_A2M_C1321Y I848I P529H,...,Mutation_ZYX_R59L,Mutation_ZYX_S182N,Mutation_ZYX_S258A,Mutation_ZYX_S303P,Mutation_ZYX_T480I,Mutation_ZYX_V212A,Mutation_ZYX_V234V,Mutation_ZYX_V471M R489Q,Mutation_ZYX_V524A,Mutation_ZYX_V560M
0,TRAIN_0000,KIPAN,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,TRAIN_0001,SARC,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,TRAIN_0002,SKCM,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,TRAIN_0003,KIRC,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,TRAIN_0004,GBMLGG,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [11]:
le = LabelEncoder()
df_4['SUBCLASS'] = le.fit_transform(df_4['SUBCLASS'])

In [12]:
train = df_4[df_4.columns[2:]]
target = df_4['SUBCLASS']

In [13]:
x_train, x_test, y_train, y_test = train_test_split(train, target, test_size = 0.3)

In [14]:
import lightgbm as lgb
dtrain = lgb.Dataset(x_train, label=y_train)
dtest = lgb.Dataset(x_test, label=y_test, reference=dtrain)

In [15]:
params = {
    'objective': 'multiclass',
    'num_class': len(le.classes_),
    'metric': 'multi_logloss'
}

In [16]:
from lightgbm import early_stopping

In [17]:
y_pred = lgb_m.predict(x_test, num_iteration=lgb_m.best_iteration)
y_pred_classes = [np.argmax(line) for line in y_pred] # convert probabilities to predicted classes

NameError: name 'lgb_m' is not defined

In [None]:
f1 = f1_score(y_test, y_pred_classes, average='weighted')
print(f'F1 score: {f1:.2f}')

In [19]:
# Step 1: Encode the target 'SUBCLASS' into numerical labels
label_encoder = LabelEncoder()
df_4['SUBCLASS'] = label_encoder.fit_transform(df_4['SUBCLASS'])

# Step 2: Convert lists to strings for mutation columns
for col in df_4.columns[2:]:
    if df_4[col].apply(lambda x: isinstance(x, list)).any():  # Check if any entry in the column is a list
        df_4[col] = df_4[col].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))  # Convert list to string

# Step 3: Convert non-numeric columns to numeric using LabelEncoder
for col in df_4.columns[2:]:
    if df_4[col].dtype == 'object':  # If the column is non-numeric (categorical)
        le = LabelEncoder()
        df_4[col] = le.fit_transform(df_4[col])

# Step 4: Split the data into features and target
train = df_4[df_4.columns[2:]]  # Features (skipping ID and SUBCLASS)
target = df_4['SUBCLASS']       # Target

# Step 5: Train-test split
x_train, x_test, y_train, y_test = train_test_split(train, target, test_size=0.3)

# Step 6: Create LightGBM dataset
dtrain = lgb.Dataset(x_train, label=y_train)
dtest = lgb.Dataset(x_test, label=y_test)

# Step 7: Set parameters for LightGBM
params = {
    'objective': 'multiclass',
    'num_class': len(label_encoder.classes_),
    'metric': 'multi_logloss'
}

# Step 8: Train the model using early_stopping as a callback
lgb_model = lgb.train(
    params,
    dtrain,
    num_boost_round=100,
    valid_sets=[dtrain, dtest],              # Track performance on both training and test sets
    valid_names=['train', 'test'],           # Name the sets for clarity in output
    callbacks=[lgb.early_stopping(stopping_rounds=10)]  # Early stopping callback
)

# Step 9: Make predictions
y_pred = lgb_model.predict(x_test, num_iteration=lgb_model.best_iteration)
y_pred_classes = [np.argmax(line) for line in y_pred]  # Convert probabilities to predicted classes

# Step 10: Calculate the F1 score
f1 = f1_score(y_test, y_pred_classes, average='weighted')
print(f'F1 Score: {f1}')

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48434
[LightGBM] [Info] Number of data points in the train set: 4340, number of used features: 2762
[LightGBM] [Info] Start training from score -4.443804
[LightGBM] [Info] Start training from score -4.044896
[LightGBM] [Info] Start training from score -2.065711
[LightGBM] [Info] Start training from score -3.750657
[LightGBM] [Info] Start training from score -3.378417
[LightGBM] [Info] Start training from score -4.974432
[LightGBM] [Info] Start training from score -2.629426
[LightGBM] [Info] Start training from score -3.351749
[LightGBM] [Info] Start training from score -2.481227
[LightGBM] [Info] Start training from score -2.920309
[LightGBM] [Info] Start training from score -3.666099
[LightGBM] [Info] Start training from score -3.222338
[LightGBM] [Info] Start training from score -3.693498
[LightGBM] [Info] Start training from score -3.

In [30]:
# 5th trial with logistic regression
df_5 = df.copy()

In [31]:
def split_mutations(mutation):
    if mutation == 'WT':
        return []
    return mutation.split()

In [32]:
for col in df_5.columns[2:]:
    df_5[col] = df_5[col].apply(split_mutations)

In [33]:
# 빈 DataFrame 생성 (추가할 원핫 인코딩된 컬럼들을 위해)
encoded_df = pd.DataFrame()

# 각 유전체 컬럼을 순회하며 원핫 인코딩 적용
for col in df_5.columns[2:]:
    # 각 유전체 컬럼에 대해 변이 리스트를 원핫 인코딩 처리
    temp_encoded = pd.get_dummies(df_5[col].apply(lambda x: ' '.join(x)), prefix=f'Mutation_{col}')
    # 기존 데이터프레임에 병합
    encoded_df = pd.concat([encoded_df, temp_encoded], axis=1)

In [34]:
df_5f = pd.concat([df_5[['ID', 'SUBCLASS']], encoded_df], axis=1)
df_5f.head()

Unnamed: 0,ID,SUBCLASS,Mutation_A2M_,Mutation_A2M_A1043T,Mutation_A2M_A1061V I1056I,Mutation_A2M_A1108T,Mutation_A2M_A1379A,Mutation_A2M_A615A,Mutation_A2M_A788A,Mutation_A2M_C1321Y I848I P529H,...,Mutation_ZYX_R59L,Mutation_ZYX_S182N,Mutation_ZYX_S258A,Mutation_ZYX_S303P,Mutation_ZYX_T480I,Mutation_ZYX_V212A,Mutation_ZYX_V234V,Mutation_ZYX_V471M R489Q,Mutation_ZYX_V524A,Mutation_ZYX_V560M
0,TRAIN_0000,KIPAN,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,TRAIN_0001,SARC,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,TRAIN_0002,SKCM,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,TRAIN_0003,KIRC,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,TRAIN_0004,GBMLGG,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [36]:
le = LabelEncoder()
df_5f['SUBCLASS'] = le.fit_transform(df_5f['SUBCLASS'])

In [37]:
for col in df_5f.columns[2:]:
    if df_5f[col].apply(lambda x: isinstance(x, list)).any():
        df_5f[col] = df_5f[col].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

In [38]:
for col in df_5f.columns[2:]:
    if df_5f[col].dtype == 'object':
        le = LabelEncoder()
        df_5f[col] = le.fit_transform(df_5f[col])

In [39]:
train = df_5f[df_5f.columns[2:]]
target = df_5f['SUBCLASS']

In [40]:
x_train, x_test, y_train, y_test = train_test_split(train, target, test_size=0.3)

In [41]:
# Step 6: Initialize Logistic Regression with OneVsRestClassifier
lr_m = make_pipeline(
    StandardScaler(),  # Standardize features
    OneVsRestClassifier(LogisticRegression(max_iter=1000))  # Use OvR strategy for multi-class classification
)

# Step 7: Train the logistic regression model
lr_m.fit(x_train, y_train)

# Step 8: Make predictions
y_pred = lr_m.predict(x_test)

# Step 9: Calculate the F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'F1 Score: {f1}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

KeyboardInterrupt

