In [1]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import itertools

In [2]:
df = pd.read_csv('./data./train.csv')
df.head()

Unnamed: 0,ID,SUBCLASS,A2M,AAAS,AADAT,AARS1,ABAT,ABCA1,ABCA2,ABCA3,...,ZNF292,ZNF365,ZNF639,ZNF707,ZNFX1,ZNRF4,ZPBP,ZW10,ZWINT,ZYX
0,TRAIN_0000,KIPAN,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
1,TRAIN_0001,SARC,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
2,TRAIN_0002,SKCM,R895R,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
3,TRAIN_0003,KIRC,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
4,TRAIN_0004,GBMLGG,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT


In [7]:
t_df = pd.read_csv('./data./test.csv')
t_df.head()

Unnamed: 0,ID,A2M,AAAS,AADAT,AARS1,ABAT,ABCA1,ABCA2,ABCA3,ABCA4,...,ZNF292,ZNF365,ZNF639,ZNF707,ZNFX1,ZNRF4,ZPBP,ZW10,ZWINT,ZYX
0,TEST_0000,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
1,TEST_0001,WT,WT,WT,WT,WT,R587Q,WT,WT,WT,...,WT,WT,WT,WT,WT,I383Sfs,WT,WT,WT,WT
2,TEST_0002,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
3,TEST_0003,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
4,TEST_0004,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT


In [4]:
# 1번 시도: 문제 있는 것들을 모두 mutation으로 변형 후 모델링 돌려보자
# df 복제하기
df_1 = df.copy()

In [5]:
# mutation으로 변경하기
df_1 = df_1.applymap(lambda x: 'Mutation' if x != 'WT' else x)

df_1.head()

Unnamed: 0,ID,SUBCLASS,A2M,AAAS,AADAT,AARS1,ABAT,ABCA1,ABCA2,ABCA3,...,ZNF292,ZNF365,ZNF639,ZNF707,ZNFX1,ZNRF4,ZPBP,ZW10,ZWINT,ZYX
0,Mutation,Mutation,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
1,Mutation,Mutation,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
2,Mutation,Mutation,Mutation,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
3,Mutation,Mutation,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
4,Mutation,Mutation,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT


In [6]:
# 1st Trial : RandomForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [7]:
# Feature, Target split
X = df_1.drop(columns=['SUBCLASS'])
y = df_1['SUBCLASS']

In [8]:
# label encoding이용해서 범주형 데이터를 수치형으로 변환
# label_encoder = LabelEncoder()

In [9]:
# X 각 열에 대해 : LAbel Encoding 적용
label_encoders = {}
for col in X.columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

In [10]:
# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Random Forest Train
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [12]:
# 예측 및 평가
y_pred = rf_model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'F1 score: {f1:.2f}')

F1 score: 1.00


In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

# 교차 검증을 사용하여 F1 스코어 평가
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='f1_weighted')
print(f'Cross-validated F1 Score: {cv_scores.mean():.2f}')

# 예측 및 Confusion Matrix 확인
y_pred = rf_model.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Cross-validated F1 Score: 1.00
Confusion Matrix:
 [[1241]]


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import f1_score

# Encode 'WT' and 'Mutation' values
df_encoded = df.copy()
for col in df_encoded.columns[2:]:  # Skip 'ID' and 'SUBCLASS' columns
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

# Encode the target variable 'SUBCLASS'
label_encoder = LabelEncoder()
df_encoded['SUBCLASS'] = label_encoder.fit_transform(df_encoded['SUBCLASS'])

# Define features (X) and target (y)
X = df_encoded.drop(['ID', 'SUBCLASS'], axis=1)
y = df_encoded['SUBCLASS']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))

f1 = f1_score(y_test, y_pred, average='weighted')
print(f'F1 score: {f1:.2f}')

F1 score: 0.28


In [9]:
# Load the test data (t_df)
# t_df = pd.read_csv("test.csv")  # Replace with your actual file path

# Manually encode 'WT' and 'Mutation' for the test set based on training data's encoding
t_df_encoded = t_df.copy()
for col in t_df_encoded.columns[1:]:  # Skip 'ID' column
    t_df_encoded[col] = t_df_encoded[col].map({'WT': 0, 'Mutation': 1})

# Prepare the test features
X_test_final = t_df_encoded.drop(['ID'], axis=1)

# Predict the SUBCLASS for the test data using the trained model
t_df_encoded['SUBCLASS'] = rf_model.predict(X_test_final)

# Decode the predicted 'SUBCLASS' back to original labels using the trained label_encoder
t_df_encoded['SUBCLASS'] = label_encoder.inverse_transform(t_df_encoded['SUBCLASS'])

# Create the submission file
submission = t_df[['ID']].copy()
submission['SUBCLASS'] = t_df_encoded['SUBCLASS']

# Save the submission file
submission.to_csv('submission.csv', index=False)
print("Submission file created successfully!")

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values