# **EDA for the client.csv dataset**

## 0 - Setting up 

### 0.1 - Libraries

In [None]:
import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

import matplotlib.pyplot as plt
from IPython.display import display, IFrame


from matplotlib.ticker import PercentFormatter
# Notebook-wide matplotlib defaults: figure size plus white backgrounds
# and a black axes outline.
plt.rcParams.update(
    {
        "figure.figsize": (8, 5),
        "axes.facecolor": "white",
        "axes.edgecolor": "black",
        "figure.facecolor": "w",
    }
)
pd.plotting.register_matplotlib_converters()


def _three_decimals(x):
    """Render a float with three digits after the decimal point."""
    return '%.3f' % x


# Floats in displayed DataFrames/Series go through the formatter above.
pd.set_option('display.float_format', _three_decimals)

# Switch seaborn to its "whitegrid" style.
# NOTE(review): this runs after the rcParams update above, so seaborn's
# style may override some of those values - confirm the order is intended.
sns.set(style="whitegrid")

## 1 - Loading the data

In [None]:
# Load the client table from the CSV file (path is relative to the notebook)
CLIENT_CSV_PATH = '../data/client.csv'
client_df = pd.read_csv(CLIENT_CSV_PATH)

## 2 - Client data

### 2.1 - Data documentation

- `product_type`
- `county` - An ID code for the county. See county_id_to_name_map.json for the mapping of ID codes to county names.
- `eic_count` - The aggregated number of consumption points (EICs - European Identifier Code).
- `installed_capacity` - Installed photovoltaic solar panel capacity in kilowatts.
- `is_business` - Boolean for whether or not the prosumer is a business.
- `date`
- `data_block_id`

In [None]:
# Preview the first five rows of the client table
client_df.head(n=5)

In [None]:
# Parse the YYYY-MM-DD strings in `date` into pandas datetimes and write
# them back into the same column
parsed_dates = pd.to_datetime(client_df['date'], format='%Y-%m-%d')
client_df['date'] = parsed_dates

### 2.2 - Checking for missing values and duplicates

In [None]:
# Print each column's dtype together with its non-null count
client_df.info(show_counts=True)

In [None]:
# Number of missing entries per column (0 means the column is complete)
client_df.isna().sum()

From the output, we can see that there are no missing values in the dataset.

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence is
# not counted as a duplicate)
client_df.duplicated(keep='first').sum()

The checks above show that the dataset has no missing values and no duplicate rows, so no cleaning is needed.

### 2.3 - Data distribution

In [None]:
# Draw the value distribution of the dataframe's columns as a grid of
# 30-bin histograms on one large figure
hist_options = {"bins": 30, "figsize": (20, 15)}
client_df.hist(**hist_options)
plt.show()
# The figure is only displayed; this cell does not write an image file.

### 2.4 - Correlations

In [None]:
# Pairwise correlations between the dataframe's columns, shown as an
# annotated heatmap.
# NOTE(review): `date` has already been converted to datetime at this
# point; confirm how the installed pandas version treats it in corr()
# (passing numeric_only explicitly may be preferable).
corr_matrix = client_df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
plt.show()
# The heatmap is only displayed; this cell does not write an image file.


In [None]:
# Installed capacity against number of consumption points, with the
# points coloured by the is_business flag
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=client_df,
    x='eic_count',
    y='installed_capacity',
    hue='is_business',
    ax=ax,
)
plt.show()

In [None]:
# Business rows only (is_business == 1): installed capacity against
# eic_count, coloured by product_type
cond = client_df['is_business'].eq(1)
business_clients = client_df[cond]

plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=business_clients,
    x='eic_count',
    y='installed_capacity',
    hue='product_type',
)
plt.show()

In [None]:
# Non-business rows only (is_business == 0): installed capacity against
# eic_count, coloured by product_type
cond = client_df['is_business'].eq(0)
private_clients = client_df[cond]

plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=private_clients,
    x='eic_count',
    y='installed_capacity',
    hue='product_type',
)
plt.show()

In [None]:
# Rows for county 9 with product_type 1, split by the is_business flag
county9_product1 = (client_df['county'] == 9) & (client_df['product_type'] == 1)
client_df_filtered_0 = client_df[county9_product1 & (client_df['is_business'] == 0)]
client_df_filtered_1 = client_df[county9_product1 & (client_df['is_business'] == 1)]

# One line per (subset, column) pair, drawn in this order on a shared axis
lines_to_draw = [
    (client_df_filtered_0, 'eic_count', 'EIC Count (is_business=0)', 'blue'),
    (client_df_filtered_0, 'installed_capacity', 'Installed Capacity (is_business=0)', 'orange'),
    (client_df_filtered_1, 'eic_count', 'EIC Count (is_business=1)', 'green'),
    (client_df_filtered_1, 'installed_capacity', 'Installed Capacity (is_business=1)', 'red'),
]

plt.figure(figsize=(10, 6))
for subset, value_column, line_label, line_color in lines_to_draw:
    plt.plot(subset['date'], subset[value_column], label=line_label, color=line_color)

# Axis labels, title and legend
plt.xlabel('Date')
plt.ylabel('Count/Capacity')
plt.title('EIC Count and Installed Capacity Over Time (County 9, product_type=1)')
plt.legend()

plt.show()

In [None]:
# Columns to compare pairwise (every column of the table except `date`)
selected_columns = ['product_type', 'county', 'eic_count', 'installed_capacity', 'is_business', 'data_block_id']
# Select from client_df, the dataframe loaded in this notebook, and bind the
# result to the same name that the pairplot call below reads.
client_df_selected = client_df[selected_columns]

# Pair plot of the selected columns; points are coloured and marked by the
# is_business flag (0 -> blue circles, 1 -> orange squares)
sns.pairplot(client_df_selected, hue='is_business', markers=['o', 's'], palette={0: 'blue', 1: 'orange'})

# Show the plot
plt.show()

In [None]:
df['capacity_per_eic'] = df['installed_capacity'] / df['eic_count']
df

In [None]:
plt.hist(df['capacity_per_eic'], bins=20, color='blue', edgecolor='black')
plt.title('Histogram of capacity_per_eic')
plt.xlabel('Values in capacity_per_eic')
plt.ylabel('Frequency')
plt.show()

In [None]:
plt.scatter(df['eic_count'], df['capacity_per_eic'], color='green', alpha=0.5)
plt.title('Scatter Plot of capacity_per_eic vs. eic_count')
plt.xlabel('eic_count')
plt.ylabel('capacity_per_eic')
plt.show()