# Enexis Energy - Data Science case study - Bird's eye view of the data

In [18]:
%pip install pandas matplotlib

Note: you may need to restart the kernel to use updated packages.


### Importing packages

In [20]:
import pandas as pd
import sys
import matplotlib.pyplot as plt
import numpy as np      # Numeric calculations
import jinja2 as jinja2

### Functions

In [21]:
def f_concat(l_input):

    # Initialize.
    dummy = ""
    n_len = len(l_input)
    
    if n_len == 1:
        return l_input[0]

    # Loop through text elements.
    for i in range(n_len-1):
        dummy = dummy + l_input[i] + ", "

    # Append last element.
    dummy = dummy + "and " + l_input[n_len-1]

    # Return result.
    return dummy

# Function to impute median value in missing numerical data
def f_impute_numerical_values(df_input):
    df_imputed_num = df_input.replace(np.nan, df_input.median())
    return df_imputed_num


# Function to impute "Unknown" value in missing categorical data
def f_impute_categorical_values(df_input):
    c_replace_by = "Unknown"
    # Replace using fillna():
    df_imputed_cat = df_input.fillna(c_replace_by)
    return df_imputed_cat

In [22]:
def f_info(df_input, c_feature, n_top =  10):

    # Testing!
    # df_input  = df_historyPerYear
    # c_feature = "year"
    # n_top     = 3

    # Do not calculate the frequency table in case the feature has unique values.
    if(df_input[c_feature].is_unique):
        print("Feature '" + c_feature + "' is unique.")
        return


    # Load package.
    from collections import Counter

    # Bereken frequenties.
    c = Counter(df_input[c_feature])

    # Converteer naar data frame.
    df_output         = pd.DataFrame(list(c.items()))

    # Hernoem kolomnamen.
    df_output.columns = ["level", "n"]

    # Bereken percentage.
    df_output["perc"] = round(100 * df_output["n"] / df_input.shape[0], 1).astype(str) + "%"

    # Sorteer data frame op frequentie.
    df_output         = df_output.sort_values(by = "n", ascending = False)

    # Reset index en verwijder index kolom die daardoor ontstaat.
    df_output         = df_output.reset_index().drop(columns=['index'])

    # Display tabel zonder index.
    # https://stackoverflow.com/questions/61363712/how-to-print-a-pandas-io-formats-style-styler-object
    if(df_output.shape[0] <= n_top):
        c.message = "we tonen alle " + str(df_output.shape[0]) + " levels:"
        
    else:
        c.message = "we tonen de Top-" + str(n_top) + " van de " + str(df_output.shape[0]) + " levels:"

        
    # Print header
    print("Frequentietabel voor '" + c_feature + "', " + c.message + "\n")
        
    print(f"'None': {c[None]} ({round(100 * c[None] / df_input.shape[0], 1)}%)")

    print(f"'NA':   {df_input[c_feature].isna().sum()} ({round(100 * df_input[c_feature].isna().sum() / df_input.shape[0], 1)}%)")
            
    display(df_output.head(n_top).style.hide_index())
    
    plt.hist(df["year"], bins=10, orientation='horizontal') #plt.show()

    # Plot frequency if count is smaller than 20.
    v_data_to_plot = df[c_feature].value_counts(sort = True)[0:n_top]
    
    # https://stackoverflow.com/questions/36367986/how-to-make-inline-plots-in-jupyter-notebook-larger
    plt.rcParams['figure.figsize'] = [15, 5]
    
    fig, ax = plt.subplots()
    v_data_to_plot.plot(kind='bar', ax=ax)  
    plt.show()
    

In [23]:


# Load local data - How to do it
df_orig = pd.read_csv(
    "../../src/data/github-data/Enexis_decentrale_opwek_kv_(zon_pv)_01012020.csv"
, delimiter=';')

# df_orig = pd.read_csv(
#    "/Users/sgawde/work/eaisi-code/enexis-code-repo/ENEXIS/application-project-enexis/data/Enexis_opwekdata_gv_wind_en_zon_11062021.csv"
# , delimiter=';')

# df_orig = pd.read_csv(
#    "/Users/sgawde/work/eaisi-code/enexis-code-repo/ENEXIS/application-project-enexis/data/Enexis_opwekdata_gv_wind_en_zon_11062021.csv"
# , delimiter=';')

df_orig_num    = df_orig.select_dtypes(include='number')
l_df_num_names = df_orig_num.columns.tolist()

print(l_df_num_names)
# Get the first row as a Series
first_row = df_orig.iloc[0]
print(first_row)
print(f"\nNumber of numerical variables: {len(l_df_num_names)}")

df_orig_cat    = df_orig.select_dtypes(include='object')
l_df_cat_names = list(df_orig_cat.columns)

print(l_df_cat_names)
print(f"\nNumber of categorical variables: {len(l_df_cat_names)}")

# Pandas Series with type of each variable (variable, column) in df_orig.
ps_missing_type    = df_orig.dtypes

# Number of missing data per variable.
ps_missing_total   = df_orig.isnull().sum()

# Percentage of missing per variable.
ps_missing_percent = round(100 * ps_missing_total / df_orig.shape[0], 1)


# Create table (Pandas DataFrame).
df_missing_data = pd.DataFrame({

    'data_type':   ps_missing_type,
    'empty_total': ps_missing_total,
    'empty_perc':  ps_missing_percent
})

# Sort table by number of missing data in descending order.
df_missing_data.sort_values(
    by        = 'empty_total',
    ascending = False,
    inplace   = True
)

# Remove variables that have no missing values.
df_missing_data = df_missing_data.query("empty_total > 0")

# Show table.
print(
    f"Number of variables having missing data: "
    f"{df_missing_data.shape[0]} (out of {df_orig.shape[1]})"
)


# df_orig_num.info()
# df_orig_cat.info()
# print(df_orig.columns)

df_imputed_num = df_orig_num.replace(np.nan, df_orig_num.median())


c_replace_by = "Unknown"
df_imputed_cat = df_orig_cat.fillna(c_replace_by)

df_imputed = pd.concat([df_imputed_cat, df_imputed_num], axis=1)


# Show table.
print(
    f"Number of variables having missing data in df_imputed: "
    f"{df_imputed.shape[0]} (out of {df_imputed.shape[1]})"
)


# Imput missing values in 'Netbeheerder' (if applicable)
# Imput missing values in 'Provincie' (if applicable)
# Imput missing values in 'Gemeente' (if applicable)
# df_impute_cat = f_impute_categorical_values(df_orig_cat)

# Imput missing values in 'Peildatum' (if applicable)
# Imput missing values in 'CBS Buurtcode' (if applicable)
# Imput missing values in 'Aantal aansluitingen in CBS-buurt ' (if applicable)
# Imput missing values in 'Aantal aansluitingen met opwekinstallatie' (if applicable)
# Imput missing values in 'Opgesteld vermogen' (if applicable)


# Imput missing values in 'Gemeente' (if applicable)
# df_orig['Gemeente'] = df_orig['Gemeente'].fillna('Unknown')



# Imput missing values in 'Gemeente' (if applicable)
print("Median values in original numerical data (first five variables):")
print(df_orig_num.median().head(5))



# Bar chart for numerical variable grouped by categories
# plt.bar(df_orig['Gemeente'], df_orig['Aantal aansluitingen met opwekinstallatie'])
# plt.xlabel('Gemeente')
# plt.ylabel('Value')
# plt.title('Bar Chart for Numerical Variable')
# plt.show()



# f_info(df_orig, 'Peildatum')
#f_info(df_orig.Provincie, n_top=5, b_show_plot=True)
#f_info(df_orig['Gemeente'])

# df = df_orig
# df['transactions'].value_counts(sort = True)[0:10]

['Peildatum', 'CBS Buurtcode', 'Aantal aansluitingen in CBS-buurt ', 'Aantal aansluitingen met opwekinstallatie', 'Opgesteld vermogen']
Peildatum                                              202001.0
Netbeheerder                                             Enexis
Provincie                                             Groningen
Gemeente                                             Appingedam
CBS Buurt                                    Appingedam-Centrum
CBS Buurtcode                                           30000.0
Aantal aansluitingen in CBS-buurt                          1.48
Aantal aansluitingen met opwekinstallatie                 155.0
Opgesteld vermogen                                        507.0
Name: 0, dtype: object

Number of numerical variables: 5
['Netbeheerder', 'Provincie', 'Gemeente', 'CBS Buurt ']

Number of categorical variables: 4
Number of variables having missing data: 9 (out of 9)
Number of variables having missing data in df_imputed: 4647 (out of 9)
Median values 

In [24]:
#f_info(df, "month", 15)

In [25]:
def f_describe(df_input, n_top = 10):
    
    print("First " + str(n_top) + " rows in de data:")
    display(df_input.head(n_top))
     
    
    df_numeric = df_input.select_dtypes(include = ['uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64'])

    if len(df_numeric.columns):
        print("Numerical data:")
        display(df_numeric.describe())

        
    df_textual = df_input.select_dtypes(include = ['category', 'object', 'bool'])

    if len(df_textual.columns):
        print("Textual data:")
        display(df_textual.describe())
        
        
    v_na = [col + " (" + str(df[col].isna().sum()) + ", " + str(round(100 * df[col].isna().sum() / df.shape[0], 1)) + "%)" for col in df.columns if df[col].isna().sum() > 0]

    if len(v_na) > 0:
        print("Features and their number of missing values:")
        display(f_concat(v_na))


In [26]:
def f_get_data(i=0):

    # Define path.
    c_path = "../../src/data/github-data/"

    # Identify file.
    v_file = ("Enexis_kleinverbruiksgegevens_01012010",                                          # 0
              "Enexis_kleinverbruiksgegevens_01012020",                                          # 1
              "Enexis_decentrale_opwek_kv_(zon_pv)_01012020",                                    # 2
              "Open_Asset_Data_Elektra_CSV/IMKL-Appurtenance_E_Lv_Charging_Point_Noord_ligging", # 3
              "CBS_PC6_2016_v2")                                                                 # 4
    
    
    # Assign decimal separator.
    if(i in [0, 1]):
        c_decimal = ","
        
    else:
        c_decimal = "."
        
        
    # Load data. Used encoding= 'unicode_escape' to read 'kvb 01012010'.
    df     = pd.read_csv(filepath_or_buffer = c_path + v_file[i] + ".csv",
                         sep                = ';',
                         decimal            = c_decimal,
                         encoding           = 'unicode_escape')
    
    print(c_path + v_file[i] + ".csv")
    print(c_decimal)
    
    # Return data.
    return df

### Importing data
Import the data by selecting the index of the file in v_file in f_get_data.

In [27]:
df = f_get_data(2)

../../src/data/github-data/Enexis_decentrale_opwek_kv_(zon_pv)_01012020.csv
.


### Downcast data as needed

In [28]:
# A few basic statistics on the data frame.
print("The data:\n")
print(f"-> has size of {sys.getsizeof(df)/1024.0/1024.0:0.2f} MB.")
print("")

df.info()

The data:

-> has size of 1.14 MB.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4647 entries, 0 to 4646
Data columns (total 9 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   ï»¿Peildatum                               3890 non-null   float64
 1   Netbeheerder                               3890 non-null   object 
 2   Provincie                                  3889 non-null   object 
 3   Gemeente                                   3888 non-null   object 
 4   CBS Buurt                                  3757 non-null   object 
 5   CBS Buurtcode                              3729 non-null   float64
 6   Aantal aansluitingen in CBS-buurt          3890 non-null   float64
 7   Aantal aansluitingen met opwekinstallatie  3890 non-null   float64
 8   Opgesteld vermogen                         3890 non-null   float64
dtypes: float64(5), object(4)
memory usage: 326.9+ KB


In [29]:
# objects to categorical - Not applicable here because there are no 'object' features
df[df.select_dtypes(include='object').columns] = df.select_dtypes(include='object').astype('category')

# convert integers to smallest unsigned integer and floats to smallest
for old, new in [('integer', 'unsigned'), ('float', 'float')]:
    
    print("\nchange: " + old + " --> " + new)
    
    for col in df.select_dtypes(include=old).columns:
        
        print(col)
        
        df[col] = pd.to_numeric(df[col], downcast=new)


change: integer --> unsigned

change: float --> float
ï»¿Peildatum
CBS Buurtcode
Aantal aansluitingen in CBS-buurt 
Aantal aansluitingen met opwekinstallatie
Opgesteld vermogen


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4647 entries, 0 to 4646
Data columns (total 9 columns):
 #   Column                                     Non-Null Count  Dtype   
---  ------                                     --------------  -----   
 0   ï»¿Peildatum                               3890 non-null   float32 
 1   Netbeheerder                               3890 non-null   category
 2   Provincie                                  3889 non-null   category
 3   Gemeente                                   3888 non-null   category
 4   CBS Buurt                                  3757 non-null   category
 5   CBS Buurtcode                              3729 non-null   float64 
 6   Aantal aansluitingen in CBS-buurt          3890 non-null   float32 
 7   Aantal aansluitingen met opwekinstallatie  3890 non-null   float32 
 8   Opgesteld vermogen                         3890 non-null   float32 
dtypes: category(4), float32(4), float64(1)
memory usage: 299.2 KB


### Some Statistics:

In [31]:
print("The data:\n")
print(f"-> contain {df.shape[0]/1e6:0.1f} million observations and {df.shape[1]} features.\n")
print(f"-> contain {df.shape[0]} observations and {df.shape[1]} features.\n")
print(f"-> have feature names: {f_concat(df.columns)}.\n")
print(f"-> has size of {sys.getsizeof(df)/1024.0/1024.0:0.2f} MB.")

The data:

-> contain 0.0 million observations and 9 features.

-> contain 4647 observations and 9 features.

-> have feature names: ï»¿Peildatum, Netbeheerder, Provincie, Gemeente, CBS Buurt , CBS Buurtcode, Aantal aansluitingen in CBS-buurt , Aantal aansluitingen met opwekinstallatie, and Opgesteld vermogen.

-> has size of 0.50 MB.


In [34]:
f_describe(df)
print(pd.__version__)

First 10 rows in de data:


Unnamed: 0,ï»¿Peildatum,Netbeheerder,Provincie,Gemeente,CBS Buurt,CBS Buurtcode,Aantal aansluitingen in CBS-buurt,Aantal aansluitingen met opwekinstallatie,Opgesteld vermogen
0,202001.0,Enexis,Groningen,Appingedam,Appingedam-Centrum,30000.0,1.48,155.0,507.0
1,202001.0,Enexis,Groningen,Appingedam,Appingedam-West,30001.0,1.331,479.0,1.685
2,202001.0,Enexis,Groningen,Appingedam,Appingedam-Oost,30002.0,2.826,439.0,1.616
3,202001.0,Enexis,Groningen,Appingedam,Verspreide huizen Damsterdiep en Eemskanaal,30007.0,230.0,97.0,405.0
4,202001.0,Enexis,Groningen,Appingedam,Verspreide huizen ten zuiden van Eemskanaal,30008.0,102.0,21.0,61.0
5,202001.0,Enexis,Groningen,Appingedam,Verspreide huizen ten noorden van het Damsterdiep,30009.0,137.0,31.0,166.0
6,202001.0,Enexis,Groningen,Groningen,Ten Boer,90000.0,1.732,534.0,1.656
7,202001.0,Enexis,Groningen,Groningen,Sint-Annen,90003.0,58.0,24.0,93.0
8,202001.0,Enexis,Groningen,Groningen,Verspreide huizen ten noorden van het Eemskanaal,90009.0,488.0,196.0,1.109
9,202001.0,Enexis,Groningen,Groningen,Ten Post,90100.0,266.0,107.0,585.0


Numerical data:


Unnamed: 0,ï»¿Peildatum,CBS Buurtcode,Aantal aansluitingen in CBS-buurt,Aantal aansluitingen met opwekinstallatie,Opgesteld vermogen
count,3890.0,3729.0,3890.0,3890.0,3890.0
mean,202001.0,9669277.0,267.369598,101.218048,293.752869
std,0.0,6547942.0,268.376007,115.934532,232.853012
min,202001.0,30000.0,1.002,1.007,1.006
25%,202001.0,1890107.0,45.25,27.0,116.0
50%,202001.0,8551707.0,183.0,59.0,236.5
75%,202001.0,16900600.0,419.0,130.0,432.0
max,202001.0,19870310.0,999.0,958.0,998.0


Textual data:


Unnamed: 0,Netbeheerder,Provincie,Gemeente,CBS Buurt
count,3890,3889,3888,3757
unique,1,5,142,3632
top,Enexis,Noord-Brabant,Groningen,Centrum
freq,3890,1394,98,17


Features and their number of missing values:


'ï»¿Peildatum (757, 16.3%), Netbeheerder (757, 16.3%), Provincie (758, 16.3%), Gemeente (759, 16.3%), CBS Buurt  (890, 19.2%), CBS Buurtcode (918, 19.8%), Aantal aansluitingen in CBS-buurt  (757, 16.3%), Aantal aansluitingen met opwekinstallatie (757, 16.3%), and Opgesteld vermogen (757, 16.3%)'

2.2.3


In [33]:
f_info(df, "Netbeheerder")

Frequentietabel voor 'Netbeheerder', we tonen alle 2 levels:

'None': 0 (0.0%)
'NA':   757 (16.3%)


AttributeError: 'Styler' object has no attribute 'hide_index'