In [1]:
import pandas as pd
import math

# Notebook display limits: frames longer than 60 rows are truncated,
# and a truncated frame shows 30 rows.
pd.options.display.max_rows = 60
pd.options.display.min_rows = 30

In [2]:
# Width of the dense output: feature columns f_0..f_4999 come first,
# followed by label columns l_0..l_3992.
NUM_FEATURES = 5_000  # number of feature columns (indices used in "index:value" tokens)
NUM_LABELS = 3_993    # number of label columns (one per label id)

In [3]:
# Load the raw training examples, using the first CSV column as the index.
# Missing cells are replaced with "" so the later cells can treat the
# "labels" and "features" columns as plain strings.
df = pd.read_csv("data/train.csv", index_col=0).fillna("")
df

Unnamed: 0_level_0,labels,features
ex_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4465211149124912651482,0:0.084556 1:0.138594 2:0.094304 3:0.195764 4:...
1,78808586,0:0.050734 1:0.762265 2:0.754431 3:0.065255 4:...
2,4575775796409391158,0:0.101468 1:0.138594 2:0.377215 3:0.130509 4:...
3,1726546931704,0:0.186024 1:0.346484 2:0.141456 3:0.195764 4:...
4,4035081017105217313183,0:0.135290 1:0.277187 2:0.141456 3:0.065255 4:...
5,1743793803814494936771328,0:0.101468 1:0.450429 2:0.141456 3:0.195764 4:...
6,592595617694923,0:0.084556 1:1.628475 2:0.330063 3:0.783057 4:...
7,4464995006718041210,0:0.050734 1:0.866210 2:0.330063 3:0.195764 5:...
8,64180611,0:0.050734 1:0.277187 2:0.188608 3:0.195764 4:...
9,5051098131018183263,0:0.355137 1:2.494685 2:0.660127 3:0.913566 4:...


## Fixing incorrectly stored no-label rows

In [4]:
# Rows without any real labels were stored incorrectly: their "labels" cell
# holds text that belongs at the start of "features". Such text is recognised
# by the ":" it contains (see the "index:value" tokens in "features").
misplaced = df["labels"].str.contains(":", regex=False, na=False)

for ex_id in df.index[misplaced]:
    print(ex_id)

    stray_text = df.at[ex_id, "labels"]
    feature_text = df.at[ex_id, "features"]

    # Put the misplaced text back in front of the features. Blank parts are
    # skipped so that no leading/trailing space is introduced, which would
    # otherwise produce an empty token when the string is split on " ".
    df.at[ex_id, "features"] = " ".join(
        part for part in (stray_text, feature_text) if part
    )

    # The row has no labels.
    df.at[ex_id, "labels"] = ""

509
1527
1939
1953
4029
4426
4643
4727
5231
5762
6295
6333
6703
7083
9477
9675
9999
10488
10736
10910
11674
12280
12599
13147
14167
14722


## One-Hot Encoding (dense feature columns + multi-hot label columns)

In [5]:
def encode_row(features, labels, num_features, num_labels):
    """Expand one example into a dense list of length num_features + num_labels.

    Parameters
    ----------
    features : str
        Whitespace-separated "index:value" tokens; each value is stored as a
        float at position ``index``. May be blank.
    labels : str
        Comma-separated label ids; position ``num_features + id`` is set to 1
        for each id. May be blank (example without labels).
    num_features, num_labels : int
        Sizes of the feature part and the label part of the output.

    Returns
    -------
    list
        Feature values first, then 0/1 label indicators. Positions that are
        not mentioned stay 0.

    Raises
    ------
    ValueError
        If a non-blank token is not of the form "index:value" or an id/value
        cannot be parsed as a number.
    """
    row = [0] * (num_features + num_labels)

    # split() without an argument skips blank strings and repeated spaces,
    # so an empty "features" cell simply yields no tokens.
    for token in features.split():
        position, value = token.split(":")
        row[int(position)] = float(value)

    for label in labels.split(","):
        label = label.strip()
        if label:  # "".split(",") gives [""]; rows without labels have a blank cell
            row[num_features + int(label)] = 1

    return row


# Number of leading rows to encode. Every row becomes
# NUM_FEATURES + NUM_LABELS dense values, so the full frame is large;
# set to None to encode all rows.
MAX_ROWS = 10

source = df if MAX_ROWS is None else df.head(MAX_ROWS)
rows = [
    encode_row(features, labels, NUM_FEATURES, NUM_LABELS)
    for features, labels in zip(source["features"], source["labels"])
]

columns = (
    ["f_{}".format(i) for i in range(NUM_FEATURES)]
    + ["l_{}".format(i) for i in range(NUM_LABELS)]
)
# The default index is 0..len(rows)-1, so it always matches the number of rows.
df_clean = pd.DataFrame(rows, columns=columns)
df_clean

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,l_3983,l_3984,l_3985,l_3986,l_3987,l_3988,l_3989,l_3990,l_3991,l_3992
0,0.084556,0.138594,0.094304,0.195764,0.612552,0.106491,0.137765,0.145839,0.30461,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.050734,0.762265,0.754431,0.065255,0.35003,0.0,0.137765,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.101468,0.138594,0.377215,0.130509,0.175015,0.745434,0.137765,1.020873,1.218441,0.309297,...,0,0,0,0,0,0,0,0,0,0
3,0.186024,0.346484,0.141456,0.195764,0.262522,0.425962,0.551062,0.437517,0.60922,0.618594,...,0,0,0,0,0,0,0,0,0,0
4,0.13529,0.277187,0.141456,0.065255,1.137597,0.106491,0.137765,0.583356,0.152305,0.154649,...,0,0,0,0,0,0,0,0,0,0
5,0.101468,0.450429,0.141456,0.195764,0.35003,0.851925,0.275531,0.145839,0.30461,1.237188,...,0,0,0,0,0,0,0,0,0,0
6,0.084556,1.628475,0.330063,0.783057,0.35003,2.023321,0.826593,1.020873,9.899832,0.618594,...,0,0,0,0,0,0,0,0,0,0
7,0.050734,0.86621,0.330063,0.195764,0.0,3.088227,0.275531,0.291678,0.30461,0.618594,...,0,0,0,0,0,0,0,0,0,0
8,0.050734,0.277187,0.188608,0.195764,0.262522,0.745434,0.275531,0.291678,0.456915,0.618594,...,0,0,0,0,0,0,0,0,0,0
9,0.355137,2.494685,0.660127,0.913566,0.612552,0.319472,0.413296,1.312552,29.394884,0.927891,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Persist the encoded frame; the positional index is not written to the file.
df_clean.to_csv(path_or_buf="data/train_cleaned.csv", index=False)

In [7]:
# Read the file back to inspect what was written.
pd.read_csv(filepath_or_buffer="data/train_cleaned.csv")

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,l_3983,l_3984,l_3985,l_3986,l_3987,l_3988,l_3989,l_3990,l_3991,l_3992
0,0.084556,0.138594,0.094304,0.195764,0.612552,0.106491,0.137765,0.145839,0.30461,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.050734,0.762265,0.754431,0.065255,0.35003,0.0,0.137765,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.101468,0.138594,0.377215,0.130509,0.175015,0.745434,0.137765,1.020873,1.218441,0.309297,...,0,0,0,0,0,0,0,0,0,0
3,0.186024,0.346484,0.141456,0.195764,0.262522,0.425962,0.551062,0.437517,0.60922,0.618594,...,0,0,0,0,0,0,0,0,0,0
4,0.13529,0.277187,0.141456,0.065255,1.137597,0.106491,0.137765,0.583356,0.152305,0.154649,...,0,0,0,0,0,0,0,0,0,0
5,0.101468,0.450429,0.141456,0.195764,0.35003,0.851925,0.275531,0.145839,0.30461,1.237188,...,0,0,0,0,0,0,0,0,0,0
6,0.084556,1.628475,0.330063,0.783057,0.35003,2.023321,0.826593,1.020873,9.899832,0.618594,...,0,0,0,0,0,0,0,0,0,0
7,0.050734,0.86621,0.330063,0.195764,0.0,3.088227,0.275531,0.291678,0.30461,0.618594,...,0,0,0,0,0,0,0,0,0,0
8,0.050734,0.277187,0.188608,0.195764,0.262522,0.745434,0.275531,0.291678,0.456915,0.618594,...,0,0,0,0,0,0,0,0,0,0
9,0.355137,2.494685,0.660127,0.913566,0.612552,0.319472,0.413296,1.312552,29.394884,0.927891,...,0,0,0,0,0,0,0,0,0,0
