# Steps

1. Communicate repeatedly with the domain experts about the data
2. Gain domain knowledge
3. Communicate with the engineers who collected the data
4. Understand the structure of the data
5. Understand the quality of the data
6. Understand whether the data is suitable for prediction
7. Identify which columns are the most important for prediction

In [None]:
! pip install ydata-profiling typing_extensions==4.7.1 sweetviz



In [None]:
import pandas as pd
import numpy as np

Data source: https://archive.ics.uci.edu/dataset/47/horse+colic

In [None]:
# Raw horse-colic file: whitespace-separated values, no header row.
data_path = "data/horse-colic.data"
# sep=r"\s+" is the supported equivalent of the deprecated
# delim_whitespace=True (any run of whitespace is one delimiter).
df = pd.read_csv(data_path, header=None, sep=r"\s+")

In [None]:
# Show the frame as loaded; at this point missing values still appear as '?'.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,2,1,530101,38.50,66,28,3,3,?,2,...,45.00,8.40,?,?,2,2,11300,0,0,2
1,1,1,534817,39.2,88,20,?,?,4,1,...,50,85,2,2,3,2,2208,0,0,2
2,2,1,530334,38.30,40,24,1,1,3,1,...,33.00,6.70,?,?,1,2,0,0,0,1
3,1,9,5290409,39.10,164,84,4,1,6,2,...,48.00,7.20,3,5.30,2,1,2208,0,0,1
4,2,1,530255,37.30,104,35,?,?,6,2,...,74.00,7.40,?,?,2,2,4300,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,1,1,533886,?,120,70,4,?,4,2,...,55,65,?,?,3,2,3205,0,0,2
296,2,1,527702,37.20,72,24,3,2,4,2,...,44.00,?,3,3.30,3,1,2208,0,0,1
297,1,1,529386,37.50,72,30,4,3,4,1,...,60.00,6.80,?,?,2,1,3205,0,0,2
298,1,1,530612,36.50,100,24,3,3,3,1,...,50.00,6.00,3,3.40,1,1,2208,0,0,1


In [None]:
# The file marks missing values with '?'; turn them into NaN.
df = df.replace('?', np.nan)
# Columns that contained '?' were parsed as strings, so they are skipped by
# numeric summaries such as describe(). Convert every column to a numeric
# dtype; an unexpected non-numeric token raises instead of being hidden.
# NOTE(review): assumes every remaining value is a number, as in the rows
# displayed above - confirm against the full file.
df = df.apply(pd.to_numeric)

In [None]:
# Show the frame again to check that the '?' markers are gone.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,2,1,530101,38.50,66,28,3,3,,2,...,45.00,8.40,,,2,2,11300,0,0,2
1,1,1,534817,39.2,88,20,,,4,1,...,50,85,2,2,3,2,2208,0,0,2
2,2,1,530334,38.30,40,24,1,1,3,1,...,33.00,6.70,,,1,2,0,0,0,1
3,1,9,5290409,39.10,164,84,4,1,6,2,...,48.00,7.20,3,5.30,2,1,2208,0,0,1
4,2,1,530255,37.30,104,35,,,6,2,...,74.00,7.40,,,2,2,4300,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,1,1,533886,,120,70,4,,4,2,...,55,65,,,3,2,3205,0,0,2
296,2,1,527702,37.20,72,24,3,2,4,2,...,44.00,,3,3.30,3,1,2208,0,0,1
297,1,1,529386,37.50,72,30,4,3,4,1,...,60.00,6.80,,,2,1,3205,0,0,2
298,1,1,530612,36.50,100,24,3,3,3,1,...,50.00,6.00,3,3.40,1,1,2208,0,0,1


In [None]:
# Summary statistics per column. By default describe() only covers
# numeric-dtype columns, so columns stored as strings are left out.
summary_stats = df.describe()
summary_stats

Unnamed: 0,1,2,23,24,25,26,27
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,1.64,1085889.0,1.363333,3657.88,90.226667,7.363333,1.67
std,2.173972,1529801.0,0.481763,5399.513513,649.569234,127.536674,0.470998
min,1.0,518476.0,1.0,0.0,0.0,0.0,1.0
25%,1.0,528904.0,1.0,2111.75,0.0,0.0,1.0
50%,1.0,530305.5,1.0,2673.5,0.0,0.0,2.0
75%,1.0,534727.5,2.0,3209.0,0.0,0.0,2.0
max,9.0,5305629.0,2.0,41110.0,7111.0,2209.0,2.0


# Open and read horse-colic.names and horse-colic.names.original files

# Rename columns

In [None]:
# Label the 28 columns V1 ... V28 (1-based) in place of the default
# integer column labels.
list_column_names = [f"V{position}" for position in range(1, 29)]
df.columns = list_column_names

In [None]:
# Show the frame with the new V1 ... V28 column labels.
df

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28
0,2,1,530101,38.50,66,28,3,3,,2,...,45.00,8.40,,,2,2,11300,0,0,2
1,1,1,534817,39.2,88,20,,,4,1,...,50,85,2,2,3,2,2208,0,0,2
2,2,1,530334,38.30,40,24,1,1,3,1,...,33.00,6.70,,,1,2,0,0,0,1
3,1,9,5290409,39.10,164,84,4,1,6,2,...,48.00,7.20,3,5.30,2,1,2208,0,0,1
4,2,1,530255,37.30,104,35,,,6,2,...,74.00,7.40,,,2,2,4300,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,1,1,533886,,120,70,4,,4,2,...,55,65,,,3,2,3205,0,0,2
296,2,1,527702,37.20,72,24,3,2,4,2,...,44.00,,3,3.30,3,1,2208,0,0,1
297,1,1,529386,37.50,72,30,4,3,4,1,...,60.00,6.80,,,2,1,3205,0,0,2
298,1,1,530612,36.50,100,24,3,3,3,1,...,50.00,6.00,3,3.40,1,1,2208,0,0,1


# Create EDA reports

In [None]:
from pathlib import Path

import pandas as pd
from ydata_profiling import ProfileReport

# Make sure the output folder exists before the report is written into it,
# so the cell also works on a fresh checkout.
Path("EDA_report").mkdir(parents=True, exist_ok=True)

# Generate the data profiling report and save it as a standalone HTML file
report = ProfileReport(df, title='My Data')
report.to_file("EDA_report/my_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]



Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
from pathlib import Path

import sweetviz as sv

# Make sure the output folder exists before the report is written into it.
Path("EDA_report").mkdir(parents=True, exist_ok=True)

# Analyze the frame with sweetviz and write the result as an HTML report.
report = sv.analyze(df)
report.show_html('EDA_report/report.html') # Generates a HTML report

                                             |          | [  0%]   00:00 -> (? left)

Report EDA_report/report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# Check steps

1. Communicate repeatedly with the domain experts about the data
2. Gain domain knowledge
3. Communicate with the engineers who collected the data
4. Understand the structure of the data
5. Understand the quality of the data
6. Understand whether the data is suitable for prediction
7. Identify which columns are the most important for prediction

In [None]:
# How often each distinct value occurs in column V2.
v2_counts = df["V2"].value_counts()
v2_counts

1    276
9     24
Name: V2, dtype: int64