# **EDA for the target data - part 2**

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

sns.set() # apply seaborn's default theme to the plots drawn in this notebook

In [None]:
# Read the consumption targets, the production targets and the
# prediction-unit lookup table, in that order.
cons_df, prod_df, pred_id_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/data_consumption.csv',
        '../data/data_production.csv',
        '../data/prediction_unit_id_dictionary.csv',
    )
)

In [None]:
# Show the first rows of the prediction-unit lookup table.
pred_id_df.head()

In [None]:
# Split the prediction-unit ids into two lists by the is_business flag (1 / 0).
business_ids = pred_id_df.loc[pred_id_df['is_business'] == 1, 'prediction_unit_id'].tolist()
private_ids = pred_id_df.loc[pred_id_df['is_business'] == 0, 'prediction_unit_id'].tolist()

In [None]:
# Show the first rows of the consumption data.
cons_df.head()

In [None]:
# Show the first rows of the production data.
prod_df.head()

In [None]:
# Installed capacity against eic_count in the production data,
# coloured by the is_business flag.
sns.scatterplot(
    data=prod_df,
    x='eic_count',
    y='installed_capacity',
    hue='is_business',
)
plt.show()

In [None]:
# Correlation between production and consumption `target`, business rows only.
# NOTE(review): Series.corr aligns both series on their index labels before
# correlating — confirm the two CSVs share the same row ordering.
prod_business_target = prod_df.loc[prod_df['is_business'] == 1, 'target']
cons_business_target = cons_df.loc[cons_df['is_business'] == 1, 'target']
prod_business_target.corr(cons_business_target)

In [None]:
# Correlation between production and consumption `log1p_target`, business rows only.
# NOTE(review): Series.corr aligns both series on their index labels before
# correlating — confirm the two CSVs share the same row ordering.
prod_business_log = prod_df.loc[prod_df['is_business'] == 1, 'log1p_target']
cons_business_log = cons_df.loc[cons_df['is_business'] == 1, 'log1p_target']
prod_business_log.corr(cons_business_log)

In [None]:
# Correlation between production and consumption `target`, non-business rows only.
# NOTE(review): Series.corr aligns both series on their index labels before
# correlating — confirm the two CSVs share the same row ordering.
prod_private_target = prod_df.loc[prod_df['is_business'] == 0, 'target']
cons_private_target = cons_df.loc[cons_df['is_business'] == 0, 'target']
prod_private_target.corr(cons_private_target)

In [None]:
# Correlation between production and consumption `log1p_target`, non-business rows only.
# NOTE(review): Series.corr aligns both series on their index labels before
# correlating — confirm the two CSVs share the same row ordering.
prod_private_log = prod_df.loc[prod_df['is_business'] == 0, 'log1p_target']
cons_private_log = cons_df.loc[cons_df['is_business'] == 0, 'log1p_target']
prod_private_log.corr(cons_private_log)

In [None]:
# Join production and consumption targets row-by-row on the shared key columns;
# the target columns get the suffixes `_prod` / `_cons`.
rel_cols = ['datetime', 'month', 'prediction_unit_id', 'county', 'is_business', 'product_type', 'eic_count']
prod_targets = prod_df[rel_cols + ['target', 'log1p_target']]
cons_targets = cons_df[rel_cols + ['target', 'log1p_target']]
target_prod_cons_df = pd.merge(
    prod_targets,
    cons_targets,
    how='inner',
    on=rel_cols,
    suffixes=('_prod', '_cons'),
)

In [None]:
# Show the first rows of the merged production/consumption frame.
target_prod_cons_df.head()

In [None]:
# Element-wise sum of the consumption and production targets.
target_prod_cons_df['target_cons_+_prod'] = target_prod_cons_df['target_cons'].add(
    target_prod_cons_df['target_prod']
)

In [None]:
# Convert the `datetime` column to pandas datetimes so it can be used as a time axis.
target_prod_cons_df['datetime'] = pd.to_datetime(target_prod_cons_df['datetime'])

In [None]:
# Normalise each target column by eic_count; the result is stored in a
# sibling column named `<column>_per_eic_count`.
for base_col in ('target_cons', 'target_prod', 'target_cons_+_prod'):
    target_prod_cons_df[f'{base_col}_per_eic_count'] = (
        target_prod_cons_df[base_col] / target_prod_cons_df['eic_count']
    )

In [None]:
# log1p production target against log1p consumption target, all merged rows.
sns.scatterplot(
    data=target_prod_cons_df,
    x='log1p_target_cons',
    y='log1p_target_prod',
)

In [None]:
# Print, for every business unit, the production/consumption target correlation.
for id_ in business_ids:
    unit_prod = prod_df.loc[prod_df['prediction_unit_id'] == id_, 'target']
    unit_cons = cons_df.loc[cons_df['prediction_unit_id'] == id_, 'target']
    print(id_, unit_prod.corr(unit_cons))

In [None]:
# Print, for every non-business unit, the production/consumption target correlation.
for id_ in private_ids:
    unit_prod = prod_df.loc[prod_df['prediction_unit_id'] == id_, 'target']
    unit_cons = cons_df.loc[cons_df['prediction_unit_id'] == id_, 'target']
    print(id_, unit_prod.corr(unit_cons))

In [None]:
# One production-vs-consumption scatter panel per business unit, coloured by
# month, laid out on a fixed 19 x 2 grid (room for up to 38 units).
fig, axes = plt.subplots(19, 2, figsize=(22,70), dpi=300)
for i, id_ in enumerate(business_ids):
    # Pick the panel first so the title and labels land on the axes that was
    # just drawn on (previously `ax` was read before it was assigned).
    ax = axes[divmod(i, 2)]
    cond = target_prod_cons_df['prediction_unit_id'] == id_
    sns.scatterplot(data = target_prod_cons_df[cond], x = 'target_prod', y = 'target_cons', hue = 'month', ax = ax)
    ax.set_title(f'unit_{id_}')
    ax.set_xlabel('production')
    ax.set_ylabel('consumption')
# Lay the figure out once, after every panel has been drawn.
plt.tight_layout()

In [None]:
# Production vs consumption per business unit (one facet per unit, coloured
# by month), with a horizontal reference line at y = 0 in every facet.
cond = target_prod_cons_df['is_business'] == 1
g = sns.FacetGrid(target_prod_cons_df[cond], hue = 'month', col = 'prediction_unit_id', col_wrap = 2,
                  sharex=False, sharey=False, aspect=2.2, height=3)
# FacetGrid chooses the target axes for each facet itself, so no `ax` is passed.
g.map_dataframe(sns.scatterplot, x="target_prod", y="target_cons")
g.add_legend()
for ax in g.axes.flat:
    ax.axhline(0)

In [None]:
# Production vs consumption per non-business unit (one facet per unit,
# coloured by month).
cond = target_prod_cons_df['is_business'] == 0
g = sns.FacetGrid(target_prod_cons_df[cond], hue = 'month', col = 'prediction_unit_id', col_wrap = 2,
                  sharex=False, sharey=False, aspect=2.2, height=3)
# FacetGrid chooses the target axes for each facet itself, so no `ax` is passed.
g.map_dataframe(sns.scatterplot, x="target_prod", y="target_cons")
g.add_legend()

In [None]:
# Time series of consumption + production for each business unit
# (one facet per unit).
cond = target_prod_cons_df['is_business'] == 1
g = sns.FacetGrid(target_prod_cons_df[cond], col = 'prediction_unit_id', col_wrap = 2,
                  sharex=False, sharey=False, aspect=2.2, height=3)
# FacetGrid chooses the target axes for each facet itself, so no `ax` is passed.
g.map_dataframe(sns.lineplot, x="datetime", y="target_cons_+_prod")
g.add_legend()

In [None]:
# Time series of (consumption + production) / eic_count for each business
# unit (one facet per unit).
cond = target_prod_cons_df['is_business'] == 1
g = sns.FacetGrid(target_prod_cons_df[cond], col = 'prediction_unit_id', col_wrap = 2,
                  sharex=False, sharey=False, aspect=2.2, height=3)
# FacetGrid chooses the target axes for each facet itself, so no `ax` is passed.
g.map_dataframe(sns.lineplot, x="datetime", y="target_cons_+_prod_per_eic_count")
g.add_legend()

In [None]:
# Overlaid log1p histograms for business rows of the merged frame.
# NOTE(review): the second histogram uses the raw `target_cons`, whereas the
# non-business cell that follows uses `target_cons_per_eic_count` — confirm
# this asymmetry is intended.
business_rows = target_prod_cons_df[target_prod_cons_df['is_business'] == 1]
sns.histplot(np.log1p(business_rows['target_cons_+_prod_per_eic_count']))
sns.histplot(np.log1p(business_rows['target_cons']))

In [None]:
# Overlaid log1p histograms for non-business rows of the merged frame:
# (consumption + production) per eic_count and consumption per eic_count.
private_rows = target_prod_cons_df[target_prod_cons_df['is_business'] == 0]
sns.histplot(np.log1p(private_rows['target_cons_+_prod_per_eic_count']))
sns.histplot(np.log1p(private_rows['target_cons_per_eic_count']))

In [None]:
# Time series of consumption per eic_count for each non-business unit
# (one facet per unit).
# The merged frame has no `target_per_eic_count` column, only the
# `target_cons_…` / `target_prod_…` / `target_cons_+_prod_…` variants.
# NOTE(review): the consumption variant is assumed to be the intended one — confirm.
cond = target_prod_cons_df['is_business'] == 0
g = sns.FacetGrid(target_prod_cons_df[cond], col = 'prediction_unit_id', col_wrap = 2,
                  sharex=False, sharey=False, aspect=2.2, height=3)
# FacetGrid chooses the target axes for each facet itself, so no `ax` is passed.
g.map_dataframe(sns.lineplot, x="datetime", y="target_cons_per_eic_count")
g.add_legend()

In [None]:
# Prediction unit 10: production + consumption, production alone and
# consumption alone, drawn as three lines on the same axes.
unit10_prod = prod_df.loc[prod_df['prediction_unit_id'] == 10, 'target']
unit10_cons = cons_df.loc[cons_df['prediction_unit_id'] == 10, 'target']
sns.lineplot(unit10_prod + unit10_cons)
sns.lineplot(unit10_prod)
sns.lineplot(unit10_cons)

In [None]:
# Prediction unit 10: consumption minus production.
unit10_cons_target = cons_df.loc[cons_df['prediction_unit_id'] == 10, 'target']
unit10_prod_target = prod_df.loc[prod_df['prediction_unit_id'] == 10, 'target']
sns.lineplot(unit10_cons_target - unit10_prod_target)

In [None]:
# For counties 0-15: which product types occur among the non-business
# prediction units of that county (one figure per county).
for county in range(16):
    county_private_units = pred_id_df[
        (pred_id_df['county'] == county) & (pred_id_df['is_business'] == 0)
    ]
    ax = sns.histplot(data=county_private_units, x='product_type', discrete=True, shrink=0.1)
    ax.set(xlim=(-1, 4), ylim=(0, 2), title=f'County {county}')
    plt.show()

In [None]:
# Overlaid log1p histograms of consumption per eic_count:
# business rows first, then non-business rows.
business_per_eic = cons_df.loc[cons_df['is_business'] == 1, 'target_per_eic_count']
private_per_eic = cons_df.loc[cons_df['is_business'] == 0, 'target_per_eic_count']
sns.histplot(np.log1p(business_per_eic))
sns.histplot(np.log1p(private_per_eic))

In [None]:
# Correlation between production and consumption `target`, business rows only.
# NOTE(review): Series.corr aligns both series on their index labels before
# correlating — confirm the two CSVs share the same row ordering.
business_prod_target = prod_df.loc[prod_df['is_business'] == 1, 'target']
business_cons_target = cons_df.loc[cons_df['is_business'] == 1, 'target']
business_prod_target.corr(business_cons_target)

In [None]:
# For each contract (product_type 0-3): overlay the log1p distribution of
# consumption per eic_count for business and non-business rows, then print
# the skewness and median of both series.
for contract in range(4):
    contract_mask = cons_df['product_type'] == contract
    aux_series_buss = np.log1p(cons_df[(cons_df['is_business']==1) & contract_mask]['target_per_eic_count'])
    aux_series_hous = np.log1p(cons_df[(cons_df['is_business']==0) & contract_mask]['target_per_eic_count'])
    # Both histograms share one axes; distinct labels keep them identifiable.
    ax = sns.histplot(aux_series_buss, label = f'Business, contract {contract}', kde = True)
    sns.histplot(aux_series_hous, label = f'Private, contract {contract}', kde = True, ax = ax)
    ax.set_xlim(0,6)
    ax.set_ylim(0,5000)
    ax.legend()
    plt.show()
    # The original printed stats of an undefined `aux_series`; report both series.
    for group_name, series in (('business', aux_series_buss), ('private', aux_series_hous)):
        print(f'{group_name} skewness:', series.skew())
        print(f'{group_name} median:', series.median())

In [None]:
# Business rows only: compare the log1p distribution of consumption per
# eic_count for contracts {0, 2} against contracts {1, 3}.
business_mask = cons_df['is_business'] == 1
aux_series_buss02 = np.log1p(cons_df[business_mask & (cons_df['product_type'].isin([0,2]))]['target_per_eic_count'])
aux_series_buss13 = np.log1p(cons_df[business_mask & (cons_df['product_type'].isin([1,3]))]['target_per_eic_count'])
# Labels name the contract groups explicitly rather than relying on a loop
# variable left over from another cell.
ax = sns.histplot(aux_series_buss02, label = 'Contracts 0 and 2', kde = True)
sns.histplot(aux_series_buss13, label = 'Contracts 1 and 3', kde = True, ax = ax)
ax.set_xlim(0,6)
ax.set_ylim(0,7000)
ax.legend()
plt.show()

In [None]:
# For counties 0-15: histogram of `log1p_target` for business consumption
# rows, split by product type (one figure per county).
for county in range(16):
    county_business = cons_df[(cons_df['is_business'] == 1) & (cons_df['county'] == county)]
    ax = sns.histplot(data=county_business, x='log1p_target', hue='product_type')
    ax.set(xlim=(0, 11), ylim=(0, 6000), title=f'County {county}')
    plt.show()

In [None]:
# For counties 0-15: histogram of `log1p_target_per_eic_count` for business
# consumption rows, split by product type (one figure per county).
for county in range(16):
    county_business = cons_df[(cons_df['is_business'] == 1) & (cons_df['county'] == county)]
    ax = sns.histplot(data=county_business, x='log1p_target_per_eic_count', hue='product_type')
    ax.set(xlim=(0, 5), ylim=(0, 2500), title=f'County {county}')
    plt.show()

In [None]:
# For counties 0-15: histogram of log1p(target_per_eic_count) for business
# consumption rows (one figure per county).
for county in range(16):
    county_per_eic = cons_df.loc[
        (cons_df['is_business'] == 1) & (cons_df['county'] == county),
        'target_per_eic_count',
    ]
    ax = sns.histplot(np.log1p(county_per_eic))
    ax.set(xlim=(0, 11), ylim=(0, 5000), title=f'County {county}')
    plt.show()

In [None]:
# Load the client table, convert its `date` column to datetimes and preview it.
client_df = pd.read_csv('../data/client.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
client_df.head()

In [None]:
# Total eic_count of business clients per date, one line per contract.
# The original loop header was incomplete (`range()` with no argument, no
# colon, no body) and did not parse.
# NOTE(review): assumes client.csv has a `product_type` column coded 0-3 like
# the consumption data — confirm.
for contract in range(4):
    contract_clients = client_df[(client_df['is_business']==1) & (client_df['product_type']==contract)]
    eic_per_date = contract_clients.groupby('date')['eic_count'].sum().reset_index()
    sns.lineplot(data = eic_per_date, x = 'date', y = 'eic_count', label = f'Contract {contract}')

In [None]:
# One histogram of log1p(target) per business prediction unit.
for id_ in pred_id_df.loc[pred_id_df['is_business'] == 1, 'prediction_unit_id']:
    plt.title(f'unit_{id_}')
    unit_target = cons_df.loc[cons_df['prediction_unit_id'] == id_, 'target']
    ax = sns.histplot(np.log1p(unit_target))
    ax.set_xlim(0, 10)
    plt.show()

In [None]:
# Business rows: log1p(target) of units 5, 10 and 48 overlaid with that of
# every other unit.
business_rows_mask = cons_df['is_business'] == 1
in_selected_units = cons_df['prediction_unit_id'].isin([5, 10, 48])
sns.histplot(np.log1p(cons_df.loc[business_rows_mask & in_selected_units, 'target']))
sns.histplot(np.log1p(cons_df.loc[business_rows_mask & ~in_selected_units, 'target']))

In [None]:
# Business rows: log1p(target) of counties 0, 2 and 11 overlaid with that of
# every other county.
business_rows_mask = cons_df['is_business'] == 1
in_selected_counties = cons_df['county'].isin([0, 2, 11])
sns.histplot(np.log1p(cons_df.loc[business_rows_mask & in_selected_counties, 'target']))
sns.histplot(np.log1p(cons_df.loc[business_rows_mask & ~in_selected_counties, 'target']))

In [None]:
# Median of log1p(target) over the business rows of units 5, 10 and 48.
selected_business_rows = (cons_df['is_business'] == 1) & cons_df['prediction_unit_id'].isin([5, 10, 48])
np.log1p(cons_df.loc[selected_business_rows, 'target']).median()

In [None]:
# Business rows: log1p(target_per_eic_count) of units 5 and 48 overlaid with
# that of every other unit.
business_rows_mask = cons_df['is_business'] == 1
in_units_5_48 = cons_df['prediction_unit_id'].isin([5, 48])
sns.histplot(np.log1p(cons_df.loc[business_rows_mask & in_units_5_48, 'target_per_eic_count']))
sns.histplot(np.log1p(cons_df.loc[business_rows_mask & ~in_units_5_48, 'target_per_eic_count']))

In [None]:
# One histogram of log1p(target_per_eic_count) per business prediction unit.
for id_ in pred_id_df.loc[pred_id_df['is_business'] == 1, 'prediction_unit_id']:
    plt.title(f'unit_{id_}')
    unit_per_eic = cons_df.loc[cons_df['prediction_unit_id'] == id_, 'target_per_eic_count']
    ax = sns.histplot(np.log1p(unit_per_eic))
    ax.set_xlim(0, 6)
    plt.show()

In [None]:
# Consumption target against eic_count, coloured by the is_business flag,
# on a large high-resolution figure.
plt.figure(dpi=300, figsize=(22, 10))
sns.scatterplot(
    data=cons_df,
    x='eic_count',
    y='target',
    hue='is_business',
)
plt.show()

In [None]:
# Consumption target against eic_count for business rows of counties 0, 2
# and 11, coloured by county.
plt.figure(dpi=300, figsize=(22, 10))
selected_county_business = cons_df[
    (cons_df['is_business'] == 1) & (cons_df['county'].isin([0, 2, 11]))
]
sns.scatterplot(data=selected_county_business, x='eic_count', y='target', hue='county')
plt.show()

In [None]:
# List the distinct county codes present in the consumption data.
pd.unique(cons_df['county'])

In [None]:
# Re-read the production data from the same path used at the top of the notebook.
# NOTE(review): looks redundant unless prod_df was modified in between — confirm.
prod_df = pd.read_csv('../data/data_production.csv')

In [None]:
# Row-wise consumption target minus production target, plotted against the
# row index.
plt.figure(dpi=300, figsize=(22, 10))
net_target = cons_df['target'].sub(prod_df['target'])
sns.scatterplot(data=net_target)
plt.show()

In [None]:
# Histogram of the log-scaled difference consumption target - production target.
plt.figure(figsize=(22,10),dpi=300)
net_target = cons_df['target'] - prod_df['target']
# np.log1p is undefined below -1, which a difference can reach whenever
# production exceeds consumption. A signed log1p keeps negative differences
# on the plot instead of turning them into NaN/-inf.
sns.histplot(np.sign(net_target) * np.log1p(np.abs(net_target)))
plt.show()

In [None]:
# Row-wise consumption target plus production target, plotted against the
# row index.
plt.figure(dpi=300, figsize=(22, 10))
total_target = cons_df['target'].add(prod_df['target'])
sns.scatterplot(data=total_target)
plt.show()

In [None]:
# Histogram of log1p(consumption target + production target).
plt.figure(dpi=300, figsize=(22, 10))
summed_target = cons_df['target'].add(prod_df['target'])
sns.histplot(np.log1p(summed_target))
plt.show()