In [None]:
# Import modules for data wrangling
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pandas_profiling import ProfileReport
from datetime import datetime

In [None]:
# Show every column when dataframes and analyses are displayed (global pandas option)
pd.set_option('display.max_columns', None)

# Read in the .csv data to be cleaned.
# Forward slashes keep the relative path portable across operating systems and avoid
# '\d', '\e' and '\T' being parsed as (invalid) string escape sequences.
data = pd.read_csv('../data/external/TTS_LBNL_public_file_07-Sep-2022_all.csv')
data.head()

In [None]:
# Number of rows and columns in the raw dataset
data.shape

In [None]:
# Column labels of the raw dataset
data.columns

In [None]:
# Data type pandas inferred for each column when reading the .csv
data.dtypes

In [None]:
# Only residential installations are of interest, so list the distinct labels used in
# 'customer_segment' to find out which one marks residential (as opposed to commercial) systems
segment_labels = data['customer_segment'].unique()
print(segment_labels)

In [None]:
# Keep only the rows labelled 'RES' (residential installations) and check how many remain
is_residential = data['customer_segment'] == 'RES'
res_data = data[is_residential]
res_data.shape

In [None]:
# Only the most recent year of data - 2021 - is of interest. Before paring the data down, count
# the 2021 installations to make sure enough remain to still create a valid model.
# The year is taken from characters 7-10 of the raw date string.
# NOTE(review): this assumes every date string has the year at that fixed position - confirm.
year_text = res_data['installation_date'].str.slice(7, 11)
newest_year = year_text == '2021'
res_data[newest_year].shape

In [None]:
# Great! More than 200,000 installations remain to analyze, which should be plenty.
# Take an independent copy of the 2021 rows so that later column assignments act on
# their own dataframe rather than on a slice of res_data.
newest_res_data = res_data.loc[newest_year].copy()
newest_res_data.head()

In [None]:
# Parse the 'installation_date' strings into datetime values, then re-check the column dtypes
parsed_dates = pd.to_datetime(newest_res_data['installation_date'])
newest_res_data['installation_date'] = parsed_dates
newest_res_data.dtypes

In [None]:
# Bin the dates by month so the model does not have to process each day of the year separately,
# then count the installations per month
install_dates = newest_res_data['installation_date']
newest_res_data['installation_month'] = install_dates.dt.month
newest_res_data['installation_month'].value_counts()

In [None]:
# Confirm that the monetary columns are correctly detected as floats
money_columns = ['total_installed_price', 'rebate_or_grant']
newest_res_data[money_columns].dtypes

In [None]:
# To perform some preliminary checks I'll take a look at the .describe() results for the pared down dataset.
# include='all' summarizes every column rather than only the numeric ones.
# NOTE(review): `datetime_is_numeric` was removed from describe() in pandas 2.0 - confirm the
# pandas version this notebook runs on; the argument has to be dropped when upgrading.
newest_res_data.describe(include='all', datetime_is_numeric=True)

In [None]:
# The final price will be 'total_installed_price' minus 'rebate_or_grant', so any rows with no
# data for 'total_installed_price' should be removed. Count how many of them are NaN.
missing_price_count = newest_res_data['total_installed_price'].isna().sum()
print(missing_price_count)

In [None]:
# Print the distinct 'total_installed_price' values in ascending order
distinct_prices = newest_res_data['total_installed_price'].unique()
print(np.sort(distinct_prices))

In [None]:
# Now I'll plot a box plot of 'total_installed_price' to look for outliers and to determine if there is
# a threshold below which we can safely say the values are erroneous or placeholders
plt.boxplot(newest_res_data['total_installed_price'])
# plt.show must be called; a bare `plt.show` only echoes the function object
plt.show()

In [None]:
# The box plot shows a lot of potential outliers, so let's take a look at a histogram with
# hand-picked, unequal-width bins (the axis itself stays linear; no log scale is applied)
price_bin_edges = [-1, 100, 10000, 20000, 30000, 40000, 50000, 100000, 200000]
plt.hist(newest_res_data['total_installed_price'], bins=price_bin_edges)
plt.show()

In [None]:
# The histogram above is concerning: it looks like more than 10,000 entries are priced lower than
# a real solar panel installation is likely to cost. Let's zoom in on the smallest values, as -1
# is likely entered in place of missing data
low_price_edges = [-1, 0, 1, 10]
plt.hist(newest_res_data['total_installed_price'], bins=low_price_edges)
plt.show()

In [None]:
# It turns out there were almost 25,000 entries of -1 that were not visible in the previous histogram.
# To look for an obvious explanation for so many missing entries, first collect just the rows
# that have -1 for 'total_installed_price'
missing_price_mask = newest_res_data['total_installed_price'] == -1
neg_price = newest_res_data[missing_price_mask]
# Then count those rows per data provider to see if a specific culprit is missing installation price
provider_counts = neg_price['data_provider_1'].value_counts()
print(provider_counts)

In [None]:
# A few primary providers are definitely missing a lot of installation price information.
# Next, express the -1 rows as a percentage of each state's installations to identify whether
# this will have a major impact on the number of results we have from any specific state
neg_by_state = neg_price['state'].value_counts()
all_by_state = newest_res_data['state'].value_counts()
print(neg_by_state.div(all_by_state).mul(100))

In [None]:
# Create the final price column that will be my metric for the model.
# -1 in 'total_installed_price' is a placeholder for missing data (identified earlier in this
# notebook), so those entries are masked to NaN first; subtracting from the placeholder would
# otherwise yield a meaningless final price.
installed_price = newest_res_data['total_installed_price'].mask(
    newest_res_data['total_installed_price'] == -1
)
# NOTE(review): 'rebate_or_grant' may use the same -1 placeholder - confirm and decide how to treat it.
newest_res_data['final_price'] = installed_price - newest_res_data['rebate_or_grant']

In [None]:
# res_data.profile_report()