## Преобразование датафрейма

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_table('labeledEligibilitySample1000000.csv', header=None)

In [3]:
data.shape

(1000000, 2)

In [4]:
data.sample(10)

Unnamed: 0,0,1
897851,__label__1,study interventions are Lenalidomide . recurre...
262581,__label__0,study interventions are Antibodies . stage iv ...
267168,__label__0,study interventions are Antibodies . noncontig...
693790,__label__1,study interventions are intraoperative complic...
203156,__label__0,study interventions are Oxaliplatin . neoplasm...
172128,__label__0,study interventions are Vincristine . brain tu...
161184,__label__0,study interventions are polymerase chain react...
260251,__label__0,study interventions are Pembrolizumab . stage ...
174777,__label__0,study interventions are Gemcitabine . cholangi...
39444,__label__0,study interventions are Immunoglobulins . loca...


In [3]:
clmned = pd.DataFrame(np.array(data).reshape(1000000,2), columns=['label', 'describe'])

In [4]:
clmned['study'], clmned['condition'] = clmned['describe'].str.split('.', 1).str

In [5]:
clmned=clmned.drop(['describe'], axis=1)

In [8]:
clmned.head(6)

Unnamed: 0,label,study,condition
0,__label__0,study interventions are recombinant CD40-ligand,melanoma skin diagnosis and no active cns met...
1,__label__0,study interventions are Liposomal doxorubicin,colorectal cancer diagnosis and cardiovascular
2,__label__0,study interventions are BI 836909,multiple myeloma diagnosis and indwelling cen...
3,__label__0,study interventions are Immunoglobulins,recurrent fallopian tube carcinoma diagnosis ...
4,__label__0,study interventions are Paclitaxel,stage ovarian cancer diagnosis and patients m...
5,__label__0,"study interventions are Antibodies, Monoclonal",recurrent verrucous carcinoma of the oral cav...


In [6]:
clmned['qualification']=clmned['label'].str.extract('(\d)', expand=True).astype(int)
clmned=clmned.drop(['label'], axis=1)

In [10]:
clmned.sample(6)

Unnamed: 0,study,condition,qualification
235017,study interventions are Bleomycin,lymphoma diagnosis and patient characteristics,0
265059,study interventions are Capecitabine,neuroendocrine tumors diagnosis and ineligibl...,0
567551,study interventions are Paclitaxel,breast cancer diagnosis and pregnant or lacta...,1
642721,study interventions are Fludarabine phosphate,mantle cell lymphoma diagnosis and lansky pla...,1
838907,study interventions are Succinylcholine,stage iiib gallbladder cancer diagnosis and p...,1
446441,study interventions are Sirolimus,malignant pancreatic glucagonoma diagnosis an...,0


In [7]:
clmned['interventions'] = clmned['study'].str.extract('study interventions are (.+)')

In [8]:
clmned=clmned.drop(['study'], axis=1)

In [13]:
clmned.sample(6)

Unnamed: 0,condition,qualification,interventions
741332,stage ii childhood anaplastic large cell lymp...,1,Etoposide
158174,non hodgkins lymphoma diagnosis and known pri...,0,Mycophenolic Acid
907900,recurrent marginal zone lymphoma diagnosis an...,1,Alemtuzumab
908801,nodal marginal zone cell lymphoma diagnosis a...,1,Cyclosporins
515360,stage iv adult hodgkin lymphoma diagnosis and...,1,Doxorubicin
37715,unspecified adult solid tumor protocol specif...,0,questionnaire administration


Теперь хорошо бы еще подробить колонку 'condition'.

In [9]:
clmned['diagnosis'] = clmned['condition'].str.extract('(.+) diagnosis.+')

In [10]:
clmned['recurrent'] = pd.np.where(clmned.condition.str.contains("recurrent"), "1", "0").astype(int)

In [11]:
clmned['conditions'] = clmned['condition'].str.extract('.+ and (.+)')

In [12]:
clmned = clmned.drop(['condition'], axis=1)

In [18]:
clmned.head(6)

Unnamed: 0,qualification,interventions,diagnosis,recurrent,conditions
0,0,recombinant CD40-ligand,melanoma skin,0,no active cns metastases by ct scan or mri
1,0,Liposomal doxorubicin,colorectal cancer,0,cardiovascular
2,0,BI 836909,multiple myeloma,0,indwelling central venous cateder or willingne...
3,0,Immunoglobulins,recurrent fallopian tube carcinoma,1,patients are allowed to receive but are not re...
4,0,Paclitaxel,stage ovarian cancer,0,patients must have recovered from the effects ...
5,0,"Antibodies, Monoclonal",recurrent verrucous carcinoma of the oral cavity,1,must have undergone radiotherapy as component ...


## Случайный бейзлайн

In [13]:
import random

In [14]:
from sklearn.metrics import roc_auc_score

In [22]:
clmned['random'] = [random.randint(0, 1) for i in range(len(clmned))]

In [23]:
roc_auc_score(clmned['random'], clmned['qualification'])

0.5010670013146523

## Неслучайный бейзлайн

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
laben = LabelEncoder()
clmned['interventions'] = laben.fit_transform(clmned['interventions'])

In [18]:
clmned['diagnosis'] = laben.fit_transform(clmned['diagnosis'].astype(str))

In [45]:
clmned.head()

Unnamed: 0,qualification,interventions,diagnosis,recurrent,conditions,random
0,0,14623,5504,0,no active cns metastases by ct scan or mri,1
1,0,6722,2432,0,cardiovascular,0
2,0,1416,5966,0,indwelling central venous cateder or willingne...,1
3,0,5891,8595,1,patients are allowed to receive but are not re...,0
4,0,8671,11274,0,patients must have recovered from the effects ...,0


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [20]:
X = clmned.iloc[:,4]
y = clmned.iloc[:,0]

In [21]:
vectorizer = TfidfVectorizer()

In [22]:
X = vectorizer.fit_transform(X.astype('U'))

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_rain, X_valid, y_rain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
c_es = [0.1, 1, 10, 20]

In [26]:
for c in c_es:
    logreg = LogisticRegression(C=c)
    logreg.fit(X_train, y_train)
    score = roc_auc_score(logreg.predict(X_valid), y_valid)
    print(c, score)

0.1 0.8387039290237873
1 0.8499545748284155
10 0.8538405274582047
20 0.8542382210334152


In [28]:
logred10 = LogisticRegression(C=10)
logred10.fit(X_train, y_train)
y_predicted = logred10.predict(X_test)

In [29]:
roc_auc_score(y_test, y_predicted)

0.8528342790948445