In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import mplcyberpunk
# plt.style.use("cyberpunk")

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
original = pd.read_csv("abalone.csv")

In [3]:
train

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,11
1,1,F,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,11
2,2,I,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,6
3,3,M,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,10
4,4,I,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,9
...,...,...,...,...,...,...,...,...,...,...
90610,90610,M,0.335,0.235,0.075,0.1585,0.0685,0.0370,0.0450,6
90611,90611,M,0.555,0.425,0.150,0.8790,0.3865,0.1815,0.2400,9
90612,90612,I,0.435,0.330,0.095,0.3215,0.1510,0.0785,0.0815,6
90613,90613,I,0.345,0.270,0.075,0.2000,0.0980,0.0490,0.0700,6


## Combining Datasets

In [4]:
train = train.drop(['id'], axis = 1)
train.columns = original.columns
train = pd.concat([train, original], ignore_index=True).reset_index().rename(columns={'index':'id'})
test.columns = train.columns[:-1]
combined = pd.concat([train, test])

In [5]:
combined_dummies = pd.get_dummies(combined['Sex'], prefix='Sex', drop_first=False)
combined = pd.concat([combined, combined_dummies], axis=1)

In [6]:
train = combined.iloc[:len(train)]
test = combined.iloc[len(train):]

combined

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0,F,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,11.0,1,0,0
1,1,F,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,11.0,1,0,0
2,2,I,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,6.0,0,1,0
3,3,M,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,10.0,0,0,1
4,4,I,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,9.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60406,151021,I,0.345,0.260,0.085,0.1775,0.0735,0.0265,0.0500,,0,1,0
60407,151022,F,0.525,0.410,0.145,0.8445,0.3885,0.1670,0.2050,,1,0,0
60408,151023,I,0.590,0.440,0.155,1.1220,0.3930,0.2000,0.2650,,0,1,0
60409,151024,F,0.660,0.525,0.190,1.4935,0.5885,0.3575,0.4350,,1,0,0


## EDA

In [7]:
def summary(df):
    print(f'data shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    summ['#missing'] = df.isnull().sum().values
    summ['%missing'] = df.isnull().sum().values / len(df)*100
    summ['#unique'] = df.nunique().values
    desc = pd.DataFrame(df.describe(include='all').transpose())
    summ['count'] = desc['count'].values
    summ['mean'] = desc['mean'].values
    summ['std'] = desc['std'].values
    summ['min'] = desc['min'].values
    summ['max'] = desc['max'].values
    summ['first value'] = df.iloc[0].values
    summ['second value'] = df.iloc[1].values
    summ['third value'] = df.iloc[2].values

    return summ

In [8]:
summary(train)

data shape: (94792, 13)


Unnamed: 0,data type,#missing,%missing,#unique,count,mean,std,min,max,first value,second value,third value
id,int64,0,0.0,94792,94792.0,47395.5,27364.237696,0.0,94791.0,0,1,2
Sex,object,0,0.0,3,94792.0,,,,,F,F,I
Length,float64,0,0.0,157,94792.0,0.517402,0.118308,0.075,0.815,0.55,0.63,0.16
Diameter,float64,0,0.0,126,94792.0,0.401952,0.098088,0.055,0.65,0.43,0.49,0.11
Height,float64,0,0.0,90,94792.0,0.135643,0.038193,0.0,1.13,0.15,0.145,0.025
Whole weight,float64,0,0.0,3205,94792.0,0.790785,0.459231,0.002,2.8255,0.7715,1.13,0.021
Shucked weight,float64,0,0.0,1806,94792.0,0.341597,0.205267,0.001,1.488,0.3285,0.458,0.0055
Viscera weight,float64,0,0.0,983,94792.0,0.169914,0.101334,0.0005,0.76,0.1465,0.2765,0.003
Shell weight,float64,0,0.0,1132,94792.0,0.226468,0.130639,0.0015,1.005,0.24,0.32,0.005
Rings,float64,0,0.0,28,94792.0,9.707233,3.178704,1.0,29.0,11.0,11.0,6.0


In [9]:
summary(test)

data shape: (60411, 13)


Unnamed: 0,data type,#missing,%missing,#unique,count,mean,std,min,max,first value,second value,third value
id,int64,0,0.0,60411,60411.0,120820.0,17439.297893,90615.0,151025.0,90615,90616,90617
Sex,object,0,0.0,3,60411.0,,,,,M,M,M
Length,float64,0,0.0,148,60411.0,0.517428,0.117609,0.075,0.8,0.645,0.58,0.56
Diameter,float64,0,0.0,130,60411.0,0.401961,0.09747,0.055,0.65,0.475,0.46,0.42
Height,float64,0,0.0,85,60411.0,0.135751,0.038175,0.0,1.095,0.155,0.16,0.14
Whole weight,float64,0,0.0,3037,60411.0,0.790062,0.457591,0.002,2.8255,1.238,0.983,0.8395
Shucked weight,float64,0,0.0,1747,60411.0,0.341227,0.204221,0.001,1.488,0.6185,0.4785,0.3525
Viscera weight,float64,0,0.0,960,60411.0,0.169419,0.10072,0.0005,0.6415,0.3125,0.2195,0.1845
Shell weight,float64,0,0.0,1089,60411.0,0.226125,0.129826,0.0015,1.004,0.3005,0.275,0.2405
Rings,float64,60411,100.0,0,0.0,,,,,,,


In [10]:
print("helki")

helki


In [11]:
train.hist(figsize = (14,8), bins = 30)
plt.tight_layout()
plt.show()

: 