In [160]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

In [161]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,Grade,Project,Case_ID,Gender,Age_at_diagnosis,Primary_Diagnosis,Race,IDH1,TP53,ATRX,...,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA
0,LGG,TCGA-LGG,TCGA-DU-8164,Male,51 years 108 days,"Oligodendroglioma, NOS",white,MUTATED,NOT_MUTATED,NOT_MUTATED,...,MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED
1,LGG,TCGA-LGG,TCGA-QH-A6CY,Male,38 years 261 days,Mixed glioma,white,MUTATED,NOT_MUTATED,NOT_MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED
2,LGG,TCGA-LGG,TCGA-HW-A5KM,Male,35 years 62 days,"Astrocytoma, NOS",white,MUTATED,MUTATED,MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED
3,LGG,TCGA-LGG,TCGA-E1-A7YE,Female,32 years 283 days,"Astrocytoma, anaplastic",white,MUTATED,MUTATED,MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,MUTATED,NOT_MUTATED
4,LGG,TCGA-LGG,TCGA-S9-A6WG,Male,31 years 187 days,"Astrocytoma, anaplastic",white,MUTATED,MUTATED,MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED


### Замена категориальных признаков

#### Показатели

In [163]:
cat_col = df.loc[:, 'IDH1':].columns
def cat_to_int(raw):
    for col in cat_col:
        if raw[col] == 'MUTATED':
            raw[col] = 1
        if raw[col] == 'NOT_MUTATED':
            raw[col] = 0
    return raw

#### Раса

In [164]:
#df['Race'].value_counts()

'''
==> 
white                               766
black or african american            59
not reported                         18
asian                                14
--                                    4
american indian or alaska native      1

Возможно стоит убрать последние две категории из-за неинформативности
'''

'\n==> \nwhite                               766\nblack or african american            59\nnot reported                         18\nasian                                14\n--                                    4\namerican indian or alaska native      1\n\nВозможно стоит убрать последние две категории из-за неинформативности\n'

In [165]:
races = list(df.Race.value_counts().index)
races_dict = {race: id for id, race in enumerate(races)}

def race_to_int(raw):
    raw['Race'] = races_dict[raw['Race']]
    return raw

#### Пол

In [166]:
#df['Gender'].value_counts()

'''
==>
Male      499
Female    359
--          4
'''

'\n==>\nMale      499\nFemale    359\n--          4\n'

In [167]:
def gender_to_int(raw):
    if raw['Gender'] == 'Male':
        raw['Gender'] = 1
    if raw['Gender'] == 'Female':
        raw['Gender'] = 0
    else:
        raw['Gender'] = 2
    return raw

#### Возраст

In [168]:
def age_to_days(raw):
    age_str = raw['Age_at_diagnosis']
    parts = age_str.split(' ')
    if len(parts) !=4:
        if parts[0] == '--':
            raw['Age_at_diagnosis'] = None
        else:
            raw['Age_at_diagnosis'] = int(parts[0])
    else:
   
        # Разделение строки на года и дн
        years = int(parts[0])
        days = int(parts[2])

        # Перевод в дни (предполагаем, что в году 365 дней)
        raw['Age_at_diagnosis'] = years * 365 + days
    return raw

#### Диагноз

In [169]:
# diagnosis = list(df.Primary_Diagnosis.value_counts().index)
# diagnosis_dict = {diag: id for id, diag in enumerate(diagnosis)}
# diagnosis_dict

'''
==>
{'Glioblastoma': 0,
 'Astrocytoma, anaplastic': 1,
 'Mixed glioma': 2,
 'Oligodendroglioma, NOS': 3,
 'Oligodendroglioma, anaplastic': 4,
 'Astrocytoma, NOS': 5,
 '--': 6}
'''

"\n==>\n{'Glioblastoma': 0,\n 'Astrocytoma, anaplastic': 1,\n 'Mixed glioma': 2,\n 'Oligodendroglioma, NOS': 3,\n 'Oligodendroglioma, anaplastic': 4,\n 'Astrocytoma, NOS': 5,\n '--': 6}\n"

In [170]:
diagnosis = list(df.Primary_Diagnosis.value_counts().index)
diagnosis_dict = {diag: id for id, diag in enumerate(diagnosis)}

def diag_to_int(raw):
    raw['Primary_Diagnosis'] = diagnosis_dict[raw['Primary_Diagnosis']]
    return raw

#### Grade

In [171]:
df.Grade.value_counts()

LGG    499
GBM    363
Name: Grade, dtype: int64

In [172]:
def grade_to_int(raw):
    if raw['Grade'] == 'LGG':
        raw['Grade'] = 0
    if raw['Grade'] == 'GBM':
        raw['Grade'] = 1
    return raw

#### Замена значений

In [173]:
df = df.apply(race_to_int, axis=1)
df = df.apply(gender_to_int, axis=1)
df = df.apply(age_to_days, axis=1)
df = df.apply(diag_to_int, axis=1)
df.loc[:, 'IDH1':] = df.loc[:, 'IDH1':].apply(cat_to_int, axis=1)
df = df.apply(grade_to_int, axis=1)
df['Age_at_diagnosis'].fillna(np.mean(df['Age_at_diagnosis']), inplace=True)

  df.loc[:, 'IDH1':] = df.loc[:, 'IDH1':].apply(cat_to_int, axis=1)


#### Удалить ненужные столбцы

In [174]:
df.drop(columns=['Project', 'Case_ID'], axis=1, inplace=True)

### Обучение модели

#### Logistic Regrassion

In [181]:
X = df.drop('Grade', axis=1)
y = df['Grade']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = LogisticRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

# Оценка модели
f1 = f1_score(y_test, predictions)
print(f'f1: {f1}')


f1: 0.9777777777777777


#### KNN

In [182]:

X = df.drop('Grade', axis=1)
y = df['Grade']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(X_train, y_train)

predictions = knn.predict(X_test)

f1 = f1_score(y_test, predictions)
print(f'F1 Score: {f1}')

F1 Score: 0.875


#### Gradient Boosting

In [188]:
X = df.drop('Grade', axis=1)
y = df['Grade']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbm.fit(X_train, y_train)

predictions = gbm.predict(X_test)

f1 = f1_score(y_test, predictions) 
print(f'F1 Score: {f1}')


F1 Score: 1.0
