In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
import plotly.express as px

In [13]:
# Load the roof dataset from disk into `data`, the frame every later cell
# splits and aggregates.
data = pd.read_csv('../data/data_5.csv')

# Preview the frame that was just loaded. The previous `df.head()` referenced
# `df`, which is only created further down the notebook, so a fresh
# Restart & Run All failed here with a NameError.
data.head()

Unnamed: 0,structure_max_elevation,footprint_max_elevation,structure_min_elevation,property_id,footprint_extrusion,footprint_min_elevation,structure_extrusion,roof_type
0,30.5,30.5,17.5,109953,13.5,17.5,13.0,Flat
1,32.5,26.5,18.0,105685,9.0,18.0,8.5,Flat
2,32.5,32.5,18.0,105685,5.5,26.5,14.5,Hip
3,20.5,19.5,17.0,104550,2.0,17.0,2.5,Flat
4,24.5,23.5,17.0,104870,6.0,17.0,6.5,Flat


In [17]:
# Random row-level split: 20% of the rows go to the test partition, with a
# fixed seed so the partition is the same on every run.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# Number of rows per roof type in each partition.
train_counts, test_counts = (
    part.groupby('roof_type').size() for part in (train_data, test_data)
)

print(f'Train data: {train_counts}\n\nTest data: {test_counts}')

Train data: roof_type
Flat       1673
Gable       415
Hip         298
Pyramid      10
Shed          4
dtype: int64

Test data: roof_type
Flat       400
Gable      115
Hip         83
Pyramid      2
dtype: int64


In [19]:
# Alternative split: hold out whole properties instead of random rows.
# The first five distinct property ids form the held-out (test) group.
unique_property_ids = data['property_id'].unique()
test_property_ids = unique_property_ids[:5]

# One boolean mask decides membership; its negation selects the remaining rows.
is_held_out = data['property_id'].isin(test_property_ids)
test_data_leave_out = data[is_held_out]
train_data_leave_out = data[~is_held_out]

# Number of rows per roof type in each partition.
train_data_leave_out_sample = train_data_leave_out.groupby('roof_type').size()
test_data_leave_out_sample = test_data_leave_out.groupby('roof_type').size()

print(f'Train data leave out: {train_data_leave_out_sample}\n\nTest data leave out: {test_data_leave_out_sample}')

Train data leave out: roof_type
Flat       2067
Gable       530
Hip         379
Pyramid      12
Shed          4
dtype: int64

Test data leave out: roof_type
Flat    6
Hip     2
dtype: int64


In [27]:
# Collect the four per-roof-type count Series side by side, one column per
# split. Roof types missing from a split come out as NaN and are filled with 0;
# `reset_index()` turns the roof type index into a regular `roof_type` column.
counts_by_split = {
    'Train': train_counts,
    'Test': test_counts,
    'Train Leave Out': train_data_leave_out_sample,
    'Test Leave Out': test_data_leave_out_sample,
}
df = pd.DataFrame(counts_by_split).reset_index().fillna(0)

In [32]:
# Long-form view of the summary: one row per (roof_type, split) pair, with the
# split name in `variable` and the row count in `value`.
df.melt(id_vars=['roof_type'])

Unnamed: 0,roof_type,variable,value
0,Flat,Train,1673.0
1,Gable,Train,415.0
2,Hip,Train,298.0
3,Pyramid,Train,10.0
4,Shed,Train,4.0
5,Flat,Test,400.0
6,Gable,Test,115.0
7,Hip,Test,83.0
8,Pyramid,Test,2.0
9,Shed,Test,0.0


In [34]:
# Grouped horizontal bars: one group per roof type, one bar per split.
counts_long = df.melt(id_vars='roof_type')
fig = px.bar(
    counts_long,
    x='value',
    y='roof_type',
    color='variable',
    barmode='group',
    title='Roof Type Distribution',
)
fig.show()
