In [15]:
import glob
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import shutil
import os
import numpy as np
from tqdm import tqdm, tqdm_notebook
from sklearn.utils import class_weight, compute_class_weight
from PIL import Image

"breast/benign/SOB/adenosis/SOB_B_A_14-22549AB/100X/SOB_B_A-14-22549AB-100-001.png"

## More balanced split

In [11]:
data = glob.glob("/media/yannis/data/cours/PPD/breast/*/*/*/*/*/*.png")
print(len(data))

7909


In [12]:
dataframe = []
for d in data:
    label = d.split("/")[7]
    dis = d.split("/")[9]
    seq = d.split("/")[10]
    zoom = d.split("/")[11]
    name = d.split("/")[12]
    
    dataframe.append({
        "name" : name,
        "label" : 1 if label == "malignant" else 0,
        "label_name" : label,
        "type_label" : dis,
        "zoom" : zoom,
        "seq" : seq,
        "path" : d
    })

In [13]:
df = pd.DataFrame(dataframe)
seq_split = df.drop_duplicates("seq", keep="first")

train, test = train_test_split(seq_split, stratify=seq_split["type"], test_size = 0.15, random_state=2)

train_seq = list(set(train["seq"]))
test_seq = list(set(test["seq"]))

In [47]:
train["type"].value_counts()

ductal_carcinoma       32
fibroadenoma            8
mucinous_carcinoma      8
tubular_adenoma         6
papillary_carcinoma     5
lobular_carcinoma       4
phyllodes_tumor         3
adenosis                3
Name: type, dtype: int64

In [48]:
test["type"].value_counts()

ductal_carcinoma       6
fibroadenoma           2
adenosis               1
mucinous_carcinoma     1
papillary_carcinoma    1
tubular_adenoma        1
lobular_carcinoma      1
Name: type, dtype: int64

In [49]:
add = train[train["type"] == "phyllodes_tumor"].reset_index().loc[0]['seq']
test_seq.append(add)
train_seq.remove(add)
add = train[train["type"] == "mucinous_carcinoma"].reset_index().loc[0]['seq']
test_seq.append(add)
train_seq.remove(add)
rmove = test[test["type"] == "ductal_carcinoma"].reset_index().loc[3]['seq']
test_seq.remove(rmove)
train_seq.append(rmove)
# rmove = test[test["type"] == "fibroadenoma"].reset_index().loc[0]['seq']
# test_seq.remove(rmove)
# train_seq.append(rmove)

In [50]:
train = df[df["seq"].isin(train_seq)]
test = df[df["seq"].isin(test_seq)]

In [57]:
train.drop_duplicates("seq", keep="first")["type"].value_counts()

ductal_carcinoma       33
fibroadenoma            8
mucinous_carcinoma      7
tubular_adenoma         6
papillary_carcinoma     5
lobular_carcinoma       4
adenosis                3
phyllodes_tumor         2
Name: type, dtype: int64

In [58]:
test.drop_duplicates("seq", keep="first")["type"].value_counts()

ductal_carcinoma       5
fibroadenoma           2
mucinous_carcinoma     2
phyllodes_tumor        1
adenosis               1
papillary_carcinoma    1
tubular_adenoma        1
lobular_carcinoma      1
Name: type, dtype: int64

In [59]:
train.drop_duplicates("seq", keep="first")["label"].value_counts()

1    49
0    19
Name: label, dtype: int64

In [60]:
test.drop_duplicates("seq", keep="first")["label"].value_counts()

1    9
0    5
Name: label, dtype: int64

In [61]:
print(len(train))
print(len(test))

6324
1585


In [62]:
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

In [67]:
os.makedirs("data/original/train", exist_ok=True)
for i in tqdm(list(train.iterrows())):
    x = i[1]["path"]
    l = i[1]["label"]
    shutil.copy(x, "data/original/train/"+str(l)+"/"+x.split("/")[-1])

100%|██████████| 6324/6324 [00:22<00:00, 275.77it/s]


In [68]:
os.makedirs("data/original/test", exist_ok=True)
for i in tqdm(list(test.iterrows())):
    x = i[1]["path"]
    l = i[1]["label"]
    shutil.copy(x, "data/original/test/"+str(l)+"/"+x.split("/")[-1])

100%|██████████| 1585/1585 [00:05<00:00, 267.15it/s]


In [74]:
class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(train["label"]),
                                                 train["label"])

In [4]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

Unnamed: 0,label,label_name,name,path,seq,type,zoom
0,0,benign,SOB_B_A-14-22549CD-100-014.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_A_14-22549CD,adenosis,100X
1,0,benign,SOB_B_A-14-22549CD-100-001.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_A_14-22549CD,adenosis,100X
2,0,benign,SOB_B_A-14-22549CD-100-002.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_A_14-22549CD,adenosis,100X
3,0,benign,SOB_B_A-14-22549CD-100-003.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_A_14-22549CD,adenosis,100X
4,0,benign,SOB_B_A-14-22549CD-100-004.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_A_14-22549CD,adenosis,100X


## Old Split

In [21]:
seq_split = list(set(df["seq"]))
train, test = train_test_split(seq_split, test_size = 0.2, random_state=2)

In [22]:
train = df[df["seq"].isin(train)]

In [23]:
test = df[df["seq"].isin(test)]

In [24]:
train["label"].value_counts()

1    4360
0    1824
Name: label, dtype: int64

In [25]:
test["label"].value_counts()

1    1069
0     656
Name: label, dtype: int64

In [26]:
test.drop_duplicates("seq", keep="first")

Unnamed: 0,label,label_name,name,path,seq,type,zoom
0,0,benign,SOB_B_A-14-22549AB-100-014.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_A_14-22549AB,SOB,100X
253,0,benign,SOB_B_A-14-22549G-100-014.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_A_14-22549G,SOB,100X
938,0,benign,SOB_B_F-14-23060AB-100-001.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_F_14-23060AB,SOB,100X
1076,0,benign,SOB_B_F-14-23222AB-100-001.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_F_14-23222AB,SOB,100X
1332,0,benign,SOB_B_F-14-9133-100-015.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_F_14-9133,SOB,100X
2285,0,benign,SOB_B_TA-14-19854C-100-001.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_TA_14-19854C,SOB,100X
2414,0,benign,SOB_B_TA-14-3411F-100-001.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_TA_14-3411F,SOB,100X
2525,1,malignant,SOB_M_DC-14-10926-100-001.png,/media/yannis/data/cours/PPD/breast/malignant/...,SOB_M_DC_14-10926,SOB,100X
3404,1,malignant,SOB_M_DC-14-14946-100-014.png,/media/yannis/data/cours/PPD/breast/malignant/...,SOB_M_DC_14-14946,SOB,100X
3535,1,malignant,SOB_M_DC-14-15572-100-001.png,/media/yannis/data/cours/PPD/breast/malignant/...,SOB_M_DC_14-15572,SOB,100X


In [60]:
train.to_csv("train.csv", index=False)

In [61]:
test.to_csv("test.csv", index=False)

In [62]:
for x in list(train["path"]):
    shutil.copy(x, "data/train/"+x.split("/")[-1])

In [63]:
for x in list(test["path"]):
    shutil.copy(x, "data/test/"+x.split("/")[-1])

In [64]:
len(train)

6461

In [65]:
train, test = train_test_split(df, stratify = df["label"], test_size = 0.15, random_state=1)

In [66]:
len(train)

6722

In [67]:
len(test)

1187

In [68]:
test["label"].value_counts(normalize=True)

1    0.686605
0    0.313395
Name: label, dtype: float64

In [69]:
train["label"].value_counts(normalize=True)

1    0.686403
0    0.313597
Name: label, dtype: float64

In [70]:
test["zoom"].value_counts(normalize=True)

100X    0.265375
40X     0.256108
200X    0.240944
400X    0.237574
Name: zoom, dtype: float64

In [71]:
train.to_csv("train.csv", index=False)

In [72]:
test.to_csv("test.csv", index=False)

In [73]:
soumissions = test[['name','label']]
soumissions = shuffle(soumissions)

In [74]:
soumissions.to_csv("soumission_real.csv", index=False)

In [75]:
for x in list(train["path"]):
    shutil.copy(x, "data/train/"+x.split("/")[-1])

In [76]:
for x in list(test["path"]):
    shutil.copy(x, "data/test/"+x.split("/")[-1])

In [77]:
len(soumissions)

1187

### folder patch3

In [9]:
all_data = pd.read_csv("train.csv").append(pd.read_csv("test.csv"))

In [41]:
new_seq = []
for row in tqdm(all_data.iterrows()):
    size = (224,224)
    image = Image.open("data/original/all/" + row[1]["name"])

    img1 = image.crop((20, 0, 460+20, 460)).resize(size)
    img2 = image.crop((120, 0, 580, 460)).resize(size)
    img3 = image.crop((240-20, 0, 700-20, 460)).resize(size)

    
    nm1 = "p1_"+row[1]["name"]
    nm2 = "p2_"+row[1]["name"]
    nm3 = "p3_"+row[1]["name"]
    
    img1.save("data/original/p3/"+nm1)
    img2.save("data/original/p3/"+nm2)
    img3.save("data/original/p3/"+nm3)

    d = dict(row[1])
    d["name"] = nm1
    new_seq.append(d)
    
    d = dict(row[1])
    d["name"] = nm2
    new_seq.append(d)
    
    d = dict(row[1])
    d["name"] = nm3
    new_seq.append(d)

7909it [05:41, 23.17it/s]


In [43]:
df = pd.DataFrame(new_seq)

In [45]:
df.to_csv("all_data_p3.csv", index=False)

### add categorical

In [3]:
cat = {
    "ductal_carcinoma":0,
    "fibroadenoma":1,
    "mucinous_carcinoma":2,
    "tubular_adenoma":3,
    "papillary_carcinoma":4,
    "lobular_carcinoma":5,
    "phyllodes_tumor":6, 
    "adenosis" :7, 
}
tp = { x[1]["name"]:x[1]["type_label"] for x in df.iterrows() }

In [25]:
train = pd.read_csv("train.csv")
train["type_label"] = [ tp[x] for x in train['name']]
train["label_cat"] = [ cat[x] for x in train["type_label"]]

In [26]:
train.head()

Unnamed: 0,label,label_name,name,path,seq,type,zoom,type_label,label_cat
0,0,benign,SOB_B_A-14-22549CD-100-014.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_A_14-22549CD,real,100X,adenosis,7
1,0,benign,SOB_B_A-14-22549CD-100-001.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_A_14-22549CD,real,100X,adenosis,7
2,0,benign,SOB_B_A-14-22549CD-100-002.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_A_14-22549CD,real,100X,adenosis,7
3,0,benign,SOB_B_A-14-22549CD-100-003.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_A_14-22549CD,real,100X,adenosis,7
4,0,benign,SOB_B_A-14-22549CD-100-004.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_A_14-22549CD,real,100X,adenosis,7


In [27]:
train.to_csv("train.csv", index=False)

In [31]:
test = pd.read_csv("test.csv")
test["label_cat"] = [ cat[x] for x in test["type"]]

In [32]:
test.head()

Unnamed: 0,label,label_name,name,path,seq,type,zoom,label_cat
0,0,benign,SOB_B_A-14-22549AB-100-014.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_A_14-22549AB,adenosis,100X,7
1,0,benign,SOB_B_A-14-22549AB-100-001.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_A_14-22549AB,adenosis,100X,7
2,0,benign,SOB_B_A-14-22549AB-100-002.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_A_14-22549AB,adenosis,100X,7
3,0,benign,SOB_B_A-14-22549AB-100-003.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_A_14-22549AB,adenosis,100X,7
4,0,benign,SOB_B_A-14-22549AB-100-004.png,/media/yannis/data/cours/PPD/breast/benign/SOB...,SOB_B_A_14-22549AB,adenosis,100X,7


In [33]:
test.to_csv("test.csv", index=False)

In [3]:
train = pd.read_csv("train.csv")

In [4]:
class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(train["label_cat"]),
                                                 train["label_cat"])

In [5]:
class_weights

array([0.26589304, 0.99309045, 1.30877483, 1.80892449, 1.68191489,
       1.57157058, 3.62614679, 2.44736842])

### train p3

In [54]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [57]:
p1 = train.copy()
p1["name"] = [ "p1_"+x for x in p1["name"] ]

p2 = train.copy()
p2["name"] = [ "p2_"+x for x in p2["name"] ]

p3 = train.copy()
p3["name"] = [ "p3_"+x for x in p3["name"] ]

train_p3 = p1.append(p2)
train_p3 = train_p3.append(p3)
train_p3.to_csv("train_p3.csv", index=False)

In [59]:
p1 = test.copy()
p1["name"] = [ "p1_"+x for x in p1["name"] ]

p2 = test.copy()
p2["name"] = [ "p2_"+x for x in p2["name"] ]

p3 = test.copy()
p3["name"] = [ "p3_"+x for x in p3["name"] ]

test_p3 = p1.append(p2)
test_p3 = test_p3.append(p3)
test_p3.to_csv("test_p3.csv", index=False)