In [41]:
import numpy as np
import pandas as pd
import plotly.express as px

# Load in Data

In [28]:
# Read the training features and preview the first rows.
df = pd.read_csv(filepath_or_buffer="data/train_features.csv")
df.head(n=5)

Unnamed: 0,bidder1.capacity,bidder2.capacity,bidder3.capacity,bidder4.capacity,price,product,winner
0,2,3,2,1,76,1,0
1,1,3,2,0,70,4,3
2,0,3,1,0,70,4,2
3,0,2,2,1,65,1,0
4,2,3,2,0,80,3,1


In [29]:
# Read the training labels and preview the first rows.
df_target = pd.read_csv(filepath_or_buffer="data/train_target.csv")
df_target.head(n=5)

Unnamed: 0,verification_result
0,False
1,False
2,True
3,False
4,True


In [30]:
# Cast the boolean label to integers (False -> 0, True -> 1).
df_target = df_target.astype({'verification_result': int})
df_target.head(n=5)

Unnamed: 0,verification_result
0,0
1,0
2,1
3,0
4,1


In [31]:
# Add the label as a `target` column; pandas aligns the values on the row index.
df = df.assign(target=df_target["verification_result"])
df.head(n=5)

Unnamed: 0,bidder1.capacity,bidder2.capacity,bidder3.capacity,bidder4.capacity,price,product,winner,target
0,2,3,2,1,76,1,0,0
1,1,3,2,0,70,4,3,0
2,0,3,1,0,70,4,2,1
3,0,2,2,1,65,1,0,0
4,2,3,2,0,80,3,1,1


# Get basic information about the dataframe

In [32]:
# Print column dtypes, non-null counts and the memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1430 entries, 0 to 1429
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   bidder1.capacity  1430 non-null   int64
 1   bidder2.capacity  1430 non-null   int64
 2   bidder3.capacity  1430 non-null   int64
 3   bidder4.capacity  1430 non-null   int64
 4   price             1430 non-null   int64
 5   product           1430 non-null   int64
 6   winner            1430 non-null   int64
 7   target            1430 non-null   int32
dtypes: int32(1), int64(7)
memory usage: 83.9 KB


In [33]:
# Per column: True if at least one value is missing.
df.isna().any()

bidder1.capacity    False
bidder2.capacity    False
bidder3.capacity    False
bidder4.capacity    False
price               False
product             False
winner              False
target              False
dtype: bool

In [34]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,bidder1.capacity,bidder2.capacity,bidder3.capacity,bidder4.capacity,price,product,winner,target
count,1430.0,1430.0,1430.0,1430.0,1430.0,1430.0,1430.0,1430.0
mean,1.011888,2.1,1.878322,0.585315,71.424476,3.255944,0.459441,0.127972
std,0.816553,0.814795,0.327028,0.49284,8.031487,1.810537,1.069701,0.334176
min,0.0,0.0,1.0,0.0,59.0,1.0,0.0,0.0
25%,0.0,1.0,2.0,0.0,65.0,2.0,0.0,0.0
50%,1.0,2.0,2.0,1.0,70.0,3.0,0.0,0.0
75%,2.0,3.0,2.0,1.0,78.0,5.0,0.0,0.0
max,2.0,3.0,2.0,1.0,90.0,6.0,4.0,1.0


In [35]:
# Pairwise correlation matrix of the columns, including `target`.
df.corr()

Unnamed: 0,bidder1.capacity,bidder2.capacity,bidder3.capacity,bidder4.capacity,price,product,winner,target
bidder1.capacity,1.0,0.066579,0.461402,-0.052081,0.280293,0.074622,-0.119222,-0.123548
bidder2.capacity,0.066579,1.0,-0.048848,-0.088353,0.051682,-0.104645,-0.020634,-0.031612
bidder3.capacity,0.461402,-0.048848,1.0,-0.065802,0.198454,-0.011187,-0.102136,-0.081533
bidder4.capacity,-0.052081,-0.088353,-0.065802,1.0,0.097716,0.134714,0.121388,0.003771
price,0.280293,0.051682,0.198454,0.097716,1.0,-0.029325,0.22922,0.211017
product,0.074622,-0.104645,-0.011187,0.134714,-0.029325,1.0,0.046916,0.025633
winner,-0.119222,-0.020634,-0.102136,0.121388,0.22922,0.046916,1.0,0.22106
target,-0.123548,-0.031612,-0.081533,0.003771,0.211017,0.025633,0.22106,1.0


# Plots about the features

In [37]:
# Bar chart: number of rows for each distinct `winner` value.
counts = df['winner'].value_counts()
fig = px.bar(
    x=counts.index,
    y=counts.to_numpy(),
    title='Entries per Winner',
    labels={'x': 'Winner', 'y': 'Values'},
)

# Render the figure
fig.show()

In [38]:
# Bar chart: number of rows for each distinct `product` value.
counts = df['product'].value_counts()
fig = px.bar(
    x=counts.index,
    y=counts.to_numpy(),
    title='Entries per Product',
    labels={'x': 'Products', 'y': 'Values'},
)

# Render the figure
fig.show()

In [9]:
# Box plot of the `price` column.
# Titled and labelled like the other figures so it stands alone when skimmed.
fig = px.box(df, y="price", title="Price Distribution", labels={"price": "Price"})
fig.show()

In [40]:
# Count the rows for every (winner, product) combination.
counts = df.groupby(['winner', 'product']).size().reset_index(name='counts')

# `product` is an identifier, not a quantity. Plotly Express maps numeric color
# columns to a continuous color scale, so cast to str to get discrete colors
# with one legend entry per product.
counts['product'] = counts['product'].astype(str)

# Stacked bars: one bar per winner, split by product.
fig = px.bar(counts, x='winner', y='counts', color='product',
             title='Products per Winner', barmode='stack',
             labels={'winner':'Winner', 'counts':'Values', 'product':'Product'})
fig.show()

## Bin Prices

In [12]:
# Freedman-Diaconis Rule: bin_width = 2 * IQR / n^(1/3)
# unfortunately it creates too many bins

# Derive the interquartile range from the data rather than hard-coding the
# quartiles, so the result stays correct if the dataset changes.
q1, q3 = df["price"].quantile([0.25, 0.75])
IQR = q3 - q1  # Q3 - Q1

bin_width = 2 * IQR / (len(df["price"]) ** (1/3))

# Determine number of bins (int() truncates any fractional bin)
price_range = df["price"].max() - df["price"].min()
num_bins = int(price_range / bin_width)

print(bin_width)
print(num_bins)

2.3077814003417765
13


In [13]:
# 6 evenly spaced edges from the lowest to the highest price,
# i.e. 5 equal-width intervals between them
bin_boundaries = np.linspace(
    start=min(df["price"]),
    stop=max(df["price"]),
    num=6,
)
bin_boundaries

array([59. , 65.2, 71.4, 77.6, 83.8, 90. ])

In [42]:
# apply bins
# np.digitize uses half-open intervals [edge_i, edge_i+1), so a price equal to
# the last edge (the maximum price) would land in an extra bin of its own.
# Clamp the result so every price falls into one of the
# len(bin_boundaries) - 1 intervals, numbered from 1.
price_bin_ids = np.digitize(df["price"], bin_boundaries)
df["price_bin"] = np.clip(price_bin_ids, 1, len(bin_boundaries) - 1)
df.head()

Unnamed: 0,bidder1.capacity,bidder2.capacity,bidder3.capacity,bidder4.capacity,price,product,winner,target,price_bin
0,2,3,2,1,76,1,0,0,3
1,1,3,2,0,70,4,3,0,2
2,0,3,1,0,70,4,2,1,2
3,0,2,2,1,65,1,0,0,1
4,2,3,2,0,80,3,1,1,4


In [43]:
# Count the rows for every (winner, price_bin) combination and
# draw them as stacked bars, one bar per winner.
counts = (
    df.groupby(['winner', 'price_bin'])
    .size()
    .reset_index(name='counts')
)
fig = px.bar(
    counts,
    x='winner',
    y='counts',
    color='price_bin',
    barmode='stack',
    title='Price Bins per Winner',
)
fig.show()

In [44]:
# Bar chart: number of rows that fall into each price bin.
counts = df['price_bin'].value_counts()
fig = px.bar(
    x=counts.index,
    y=counts.to_numpy(),
    title='Distribution of Price Entries per Bin',
    labels={'x': 'Bins', 'y': 'Values'},
)

# Render the figure
fig.show()

## Target analysis

In [49]:
# Bar chart: number of rows per target label (0 / 1).
counts = df['target'].value_counts()
# The x axis carries the label values and the y axis their counts; the axis
# titles were previously swapped.
fig = px.bar(x=counts.index, y=counts.values, labels={'x':'Label', 'y':'Values'},
             title='Distribution of Target Variable')

# Show plot
fig.show()