### Imports

In [2]:
import os
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

In [3]:
# folder path
data_folder = 'SolarPanelSoilingImageDataset/Solar_Panel_Soiling_Image_dataset/PanelImages/'

In [4]:
files = os.listdir(data_folder)

## Structure of the file name

solar_day_Month_date_hour__minute__second__year_L_%ageloss_I_irradiancelevel.jpg


In [46]:
files

['solar_Fri_Jun_16_10__0__11_2017_L_0.906153208302_I_0.321592156863.jpg',
 'solar_Fri_Jun_16_10__0__16_2017_L_0.903081697073_I_0.293192156863.jpg',
 'solar_Fri_Jun_16_10__0__1_2017_L_0.916698044034_I_0.39577254902.jpg',
 'solar_Fri_Jun_16_10__0__21_2017_L_0.903081697073_I_0.293192156863.jpg',
 'solar_Fri_Jun_16_10__0__26_2017_L_0.896087391118_I_0.27462745098.jpg',
 'solar_Fri_Jun_16_10__0__31_2017_L_0.896087391118_I_0.27462745098.jpg',
 'solar_Fri_Jun_16_10__0__36_2017_L_0.894974574172_I_0.269141176471.jpg',
 'solar_Fri_Jun_16_10__0__41_2017_L_0.894974574172_I_0.269141176471.jpg',
 'solar_Fri_Jun_16_10__0__46_2017_L_0.890988502987_I_0.261278431373.jpg',
 'solar_Fri_Jun_16_10__0__51_2017_L_0.890988502987_I_0.261278431373.jpg',
 'solar_Fri_Jun_16_10__0__56_2017_L_0.894472593275_I_0.255411764706.jpg',
 'solar_Fri_Jun_16_10__0__6_2017_L_0.906153208302_I_0.321592156863.jpg',
 'solar_Fri_Jun_16_10__10__17_2017_L_0.898027314112_I_0.284274509804.jpg',
 'solar_Fri_Jun_16_10__10__28_2017_L_0.892

### Preprocessing for the dataframe

In [29]:
examples = []
for file in files:
    parts = file.split('_')
    solar_data = {
        'type': parts[0], # 'solar'
        'day_of_week': parts[1], # day of the week
        'month': parts[2],
        'day': parts[3], # day
        'hour': parts[4], # hout
        'minute': parts[6], # minute
        'second': parts[8], # second
        'year': parts[9], # 'year'
        'loss_percentage': parts[11], # 'ageloss' with 'L%' removed
        'irradiance_level': parts[13][0:-4], # 'irradiancelevel' with 'I' removed
        'original_title': file,
    }
    examples.append(solar_data)

In [30]:
df = pd.DataFrame(examples)

In [31]:
df

Unnamed: 0,type,day_of_week,month,day,hour,minute,second,year,loss_percentage,irradiance_level,original_title
0,solar,Fri,Jun,16,10,0,11,2017,0.906153208302,0.321592156863,solar_Fri_Jun_16_10__0__11_2017_L_0.9061532083...
1,solar,Fri,Jun,16,10,0,16,2017,0.903081697073,0.293192156863,solar_Fri_Jun_16_10__0__16_2017_L_0.9030816970...
2,solar,Fri,Jun,16,10,0,1,2017,0.916698044034,0.39577254902,solar_Fri_Jun_16_10__0__1_2017_L_0.91669804403...
3,solar,Fri,Jun,16,10,0,21,2017,0.903081697073,0.293192156863,solar_Fri_Jun_16_10__0__21_2017_L_0.9030816970...
4,solar,Fri,Jun,16,10,0,26,2017,0.896087391118,0.27462745098,solar_Fri_Jun_16_10__0__26_2017_L_0.8960873911...
...,...,...,...,...,...,...,...,...,...,...,...
45749,solar,Wed,Jun,28,7,9,44,2017,0.0067850003029,0.0647333333333,solar_Wed_Jun_28_7__9__44_2017_L_0.00678500030...
45750,solar,Wed,Jun,28,7,9,49,2017,0.0067850003029,0.0647333333333,solar_Wed_Jun_28_7__9__49_2017_L_0.00678500030...
45751,solar,Wed,Jun,28,7,9,54,2017,0.0210669184468,0.0664549019608,solar_Wed_Jun_28_7__9__54_2017_L_0.02106691844...
45752,solar,Wed,Jun,28,7,9,59,2017,0.0210669184468,0.0664549019608,solar_Wed_Jun_28_7__9__59_2017_L_0.02106691844...


In [32]:
df[df['original_title'] == 'solar_Fri_Jun_23_15__55__48_2017_L_9.47445213972e-06_I_0.413909803922.jpg']

Unnamed: 0,type,day_of_week,month,day,hour,minute,second,year,loss_percentage,irradiance_level,original_title
4599,solar,Fri,Jun,23,15,55,48,2017,9.47445213972e-06,0.413909803922,solar_Fri_Jun_23_15__55__48_2017_L_9.474452139...


In [33]:
df['loss_percentage'] = pd.to_numeric(df['loss_percentage'], errors='coerce')
df['irradiance_level'] = pd.to_numeric(df['irradiance_level'])

In [39]:
import pandas as pd

# Month name to numerical mapping
month_mapping = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
    'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
    'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

df = pd.DataFrame(examples)
df['loss_percentage'] = pd.to_numeric(df['loss_percentage'], errors='coerce')
df['irradiance_level'] = pd.to_numeric(df['irradiance_level'])
df['month'] = df['month'].map(month_mapping)

df['timestamp'] = pd.to_datetime(df[['year', 'month', 'day', 'hour', 'minute', 'second']])

df = df.sort_values(by='timestamp')

df = df.drop(columns=['timestamp'])

df


Unnamed: 0,type,day_of_week,month,day,hour,minute,second,year,loss_percentage,irradiance_level,original_title
28465,solar,Tue,6,13,9,46,49,2017,0.047484,0.296733,solar_Tue_Jun_13_9__46__49_2017_L_0.0474843723...
28466,solar,Tue,6,13,9,46,54,2017,0.027331,0.288831,solar_Tue_Jun_13_9__46__54_2017_L_0.0273312333...
28467,solar,Tue,6,13,9,46,59,2017,0.027331,0.288831,solar_Tue_Jun_13_9__46__59_2017_L_0.0273312333...
28468,solar,Tue,6,13,9,47,4,2017,0.039799,0.300631,solar_Tue_Jun_13_9__47__4_2017_L_0.03979859380...
28470,solar,Tue,6,13,9,47,9,2017,0.039799,0.300631,solar_Tue_Jun_13_9__47__9_2017_L_0.03979859380...
...,...,...,...,...,...,...,...,...,...,...,...
7738,solar,Fri,6,30,15,53,27,2017,0.567408,0.243176,solar_Fri_Jun_30_15__53__27_2017_L_0.567408482...
7739,solar,Fri,6,30,15,53,32,2017,0.567408,0.243176,solar_Fri_Jun_30_15__53__32_2017_L_0.567408482...
7742,solar,Fri,6,30,15,54,7,2017,0.565741,0.291667,solar_Fri_Jun_30_15__54__7_2017_L_0.5657411764...
7741,solar,Fri,6,30,15,54,12,2017,0.565741,0.291667,solar_Fri_Jun_30_15__54__12_2017_L_0.565741176...


In [40]:
df.dtypes

type                 object
day_of_week          object
month                 int64
day                  object
hour                 object
minute               object
second               object
year                 object
loss_percentage     float64
irradiance_level    float64
original_title       object
dtype: object

#### Create unique ID

In [41]:
# Reset the index to create a new column 'unique_id'
df['unique_id'] = df.reset_index().index

df

Unnamed: 0,type,day_of_week,month,day,hour,minute,second,year,loss_percentage,irradiance_level,original_title,unique_id
28465,solar,Tue,6,13,9,46,49,2017,0.047484,0.296733,solar_Tue_Jun_13_9__46__49_2017_L_0.0474843723...,0
28466,solar,Tue,6,13,9,46,54,2017,0.027331,0.288831,solar_Tue_Jun_13_9__46__54_2017_L_0.0273312333...,1
28467,solar,Tue,6,13,9,46,59,2017,0.027331,0.288831,solar_Tue_Jun_13_9__46__59_2017_L_0.0273312333...,2
28468,solar,Tue,6,13,9,47,4,2017,0.039799,0.300631,solar_Tue_Jun_13_9__47__4_2017_L_0.03979859380...,3
28470,solar,Tue,6,13,9,47,9,2017,0.039799,0.300631,solar_Tue_Jun_13_9__47__9_2017_L_0.03979859380...,4
...,...,...,...,...,...,...,...,...,...,...,...,...
7738,solar,Fri,6,30,15,53,27,2017,0.567408,0.243176,solar_Fri_Jun_30_15__53__27_2017_L_0.567408482...,45749
7739,solar,Fri,6,30,15,53,32,2017,0.567408,0.243176,solar_Fri_Jun_30_15__53__32_2017_L_0.567408482...,45750
7742,solar,Fri,6,30,15,54,7,2017,0.565741,0.291667,solar_Fri_Jun_30_15__54__7_2017_L_0.5657411764...,45751
7741,solar,Fri,6,30,15,54,12,2017,0.565741,0.291667,solar_Fri_Jun_30_15__54__12_2017_L_0.565741176...,45752


### Max Value and minimum value Per atribute

In [42]:
max_loss_percentage = df['loss_percentage'].max()
min_loss_percentage = df['loss_percentage'].min()

max_irradiance_level = df['irradiance_level'].max()
min_irradiance_level = df['irradiance_level'].min()

print("Max Loss Percentage:", max_loss_percentage)
print("Min Loss Percentage:", min_loss_percentage)
print("Max Irradiance Level:", max_irradiance_level)
print("Min Irradiance Level:", min_irradiance_level)

Max Loss Percentage: 0.996158869171
Min Loss Percentage: 0.0
Max Irradiance Level: 1.0061254902
Min Irradiance Level: 0.00268235294118


#### Visualization

In [14]:
bin_width = 0.1
min_value = df['loss_percentage'].min()
max_value = df['loss_percentage'].max()
num_bins = int((max_value - min_value) / bin_width)

# Create the histogram
plt.hist(df['loss_percentage'], bins=np.arange(min_value, max_value + bin_width, bin_width), edgecolor='black')

# Label axes and add a title
plt.xlabel('Loss Percentage')
plt.ylabel('Frequency')
plt.title('Distribution of Loss Percentages')

# Show the plot
plt.show()

TypeError: unsupported operand type(s) for -: 'str' and 'str'