In [1]:
import os
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

#### Data Directory


In [2]:
# Folder that holds the panel images.
# Built with os.path.join so the path uses the right separator on every OS and
# contains no backslash escape sequences. The empty last component keeps a
# trailing separator, so `data_folder + filename` still yields a valid path.
data_folder = os.path.join(
    'SolarPanelSoilingImageDataset',
    'Solar_Panel_Soiling_Image_dataset',
    'PanelImages_originaldata',
    '',
)

# os.listdir returns entries in arbitrary, platform-dependent order; sorting
# makes the listing (and every row index derived from it) reproducible.
files = sorted(os.listdir(data_folder))

# Peek at the first few filenames
files[0:10]


['solar_Fri_Jun_16_10__0__11_2017_L_0.906153208302_I_0.321592156863.jpg',
 'solar_Fri_Jun_16_10__0__16_2017_L_0.903081697073_I_0.293192156863.jpg',
 'solar_Fri_Jun_16_10__0__1_2017_L_0.916698044034_I_0.39577254902.jpg',
 'solar_Fri_Jun_16_10__0__21_2017_L_0.903081697073_I_0.293192156863.jpg',
 'solar_Fri_Jun_16_10__0__26_2017_L_0.896087391118_I_0.27462745098.jpg',
 'solar_Fri_Jun_16_10__0__31_2017_L_0.896087391118_I_0.27462745098.jpg',
 'solar_Fri_Jun_16_10__0__36_2017_L_0.894974574172_I_0.269141176471.jpg',
 'solar_Fri_Jun_16_10__0__41_2017_L_0.894974574172_I_0.269141176471.jpg',
 'solar_Fri_Jun_16_10__0__46_2017_L_0.890988502987_I_0.261278431373.jpg',
 'solar_Fri_Jun_16_10__0__51_2017_L_0.890988502987_I_0.261278431373.jpg']

### Preprocessing

#### Breaking down each filename into date/time, loss and irradiance columns


In [5]:
# Filenames look like
#   solar_<DayOfWeek>_<Month>_<day>_<hour>__<minute>__<second>_<year>_L_<loss>_I_<irradiance>.<ext>
# Splitting the stem on '_' therefore gives exactly 14 parts; the double
# underscores produce empty parts at positions 5 and 7.
EXPECTED_PARTS = 14
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')

examples = []
skipped_files = []  # entries that do not follow the naming scheme (e.g. Thumbs.db)
for file in files:
    # Strip the extension explicitly instead of assuming it is 4 characters long
    stem, extension = os.path.splitext(file)
    parts = stem.split('_')

    if extension.lower() not in IMAGE_EXTENSIONS or len(parts) != EXPECTED_PARTS:
        skipped_files.append(file)
        continue

    solar_data = {
        'type': parts[0],               # 'solar'
        'day_of_week': parts[1],        # e.g. 'Fri'
        'month': parts[2],              # abbreviated month name, e.g. 'Jun'
        'day': parts[3],                # day of the month
        'hour': parts[4],               # hour
        'minute': parts[6],             # minute (parts[5] is empty)
        'second': parts[8],             # second (parts[7] is empty)
        'year': parts[9],               # year
        'loss_percentage': parts[11],   # value following the 'L' marker
        'irradiance_level': parts[13],  # value following the 'I' marker
        'original_title': file,
    }
    examples.append(solar_data)

if skipped_files:
    print(f"Skipped {len(skipped_files)} file(s) that do not match the expected name pattern")

# Build the frame once from the list of records; all values are still strings here
df = pd.DataFrame(examples)


In [6]:
# Inspect the parsed frame: one row per parsed filename, values are still strings
df

Unnamed: 0,type,day_of_week,month,day,hour,minute,second,year,loss_percentage,irradiance_level,original_title
0,solar,Fri,Jun,16,10,0,11,2017,0.906153208302,0.321592156863,solar_Fri_Jun_16_10__0__11_2017_L_0.9061532083...
1,solar,Fri,Jun,16,10,0,16,2017,0.903081697073,0.293192156863,solar_Fri_Jun_16_10__0__16_2017_L_0.9030816970...
2,solar,Fri,Jun,16,10,0,1,2017,0.916698044034,0.39577254902,solar_Fri_Jun_16_10__0__1_2017_L_0.91669804403...
3,solar,Fri,Jun,16,10,0,21,2017,0.903081697073,0.293192156863,solar_Fri_Jun_16_10__0__21_2017_L_0.9030816970...
4,solar,Fri,Jun,16,10,0,26,2017,0.896087391118,0.27462745098,solar_Fri_Jun_16_10__0__26_2017_L_0.8960873911...
...,...,...,...,...,...,...,...,...,...,...,...
45749,solar,Wed,Jun,28,7,9,44,2017,0.0067850003029,0.0647333333333,solar_Wed_Jun_28_7__9__44_2017_L_0.00678500030...
45750,solar,Wed,Jun,28,7,9,49,2017,0.0067850003029,0.0647333333333,solar_Wed_Jun_28_7__9__49_2017_L_0.00678500030...
45751,solar,Wed,Jun,28,7,9,54,2017,0.0210669184468,0.0664549019608,solar_Wed_Jun_28_7__9__54_2017_L_0.02106691844...
45752,solar,Wed,Jun,28,7,9,59,2017,0.0210669184468,0.0664549019608,solar_Wed_Jun_28_7__9__59_2017_L_0.02106691844...


#### Drop unnecessary columns

In [8]:
# Columns to remove from the frame
columns_to_drop = ['type', 'day_of_week']

# errors='ignore' keeps this cell idempotent: running it a second time, after
# the columns are already gone, is a no-op instead of raising KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')

#### Month Mapping

In [12]:
# Month name to numerical mapping
month_mapping = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
    'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
    'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

# Also accept values that are already month numbers (1 -> 1, ..., 12 -> 12).
# Without this, re-running the cell would map the numeric months to NaN.
month_lookup = dict(month_mapping)
month_lookup.update({number: number for number in month_mapping.values()})

mapped_month = df['month'].map(month_lookup)

# Anything that is neither a known abbreviation nor a month number maps to NaN;
# fail loudly here rather than carrying NaN months into later cells.
unknown_months = set(df.loc[mapped_month.isna(), 'month'])
if unknown_months:
    raise ValueError(f"Unrecognised month values: {sorted(map(str, unknown_months))}")

df['month'] = mapped_month


#### Convert dtypes to numeric

In [17]:
columns_to_convert = ['day', 'hour', 'minute', 'second', 'year', 'loss_percentage', 'irradiance_level']

# Parse every listed column first (unparseable values become NaN via errors='coerce') ...
converted = {
    name: pd.to_numeric(df[name], errors='coerce')
    for name in columns_to_convert
}

# ... then write the numeric versions back over the string columns
for name, numeric_values in converted.items():
    df[name] = numeric_values


#### Ordering the dataframe by timestamp

In [28]:
# Assemble one datetime per row from the numeric date/time component columns
timestamp_parts = ['year', 'month', 'day', 'hour', 'minute', 'second']
df['timestamp'] = pd.to_datetime(df[timestamp_parts])

# Put the rows in chronological order and give them a fresh 0..n-1 index
df = df.sort_values(by='timestamp').reset_index(drop=True)


In [29]:
# Inspect the frame after sorting by timestamp and resetting the index
df

Unnamed: 0,month,day,hour,minute,second,year,loss_percentage,irradiance_level,original_title,timestamp
0,6,13,9,46,49,2017,0.047484,0.296733,solar_Tue_Jun_13_9__46__49_2017_L_0.0474843723...,2017-06-13 09:46:49
1,6,13,9,46,54,2017,0.027331,0.288831,solar_Tue_Jun_13_9__46__54_2017_L_0.0273312333...,2017-06-13 09:46:54
2,6,13,9,46,59,2017,0.027331,0.288831,solar_Tue_Jun_13_9__46__59_2017_L_0.0273312333...,2017-06-13 09:46:59
3,6,13,9,47,4,2017,0.039799,0.300631,solar_Tue_Jun_13_9__47__4_2017_L_0.03979859380...,2017-06-13 09:47:04
4,6,13,9,47,9,2017,0.039799,0.300631,solar_Tue_Jun_13_9__47__9_2017_L_0.03979859380...,2017-06-13 09:47:09
...,...,...,...,...,...,...,...,...,...,...
45749,6,30,15,53,27,2017,0.567408,0.243176,solar_Fri_Jun_30_15__53__27_2017_L_0.567408482...,2017-06-30 15:53:27
45750,6,30,15,53,32,2017,0.567408,0.243176,solar_Fri_Jun_30_15__53__32_2017_L_0.567408482...,2017-06-30 15:53:32
45751,6,30,15,54,7,2017,0.565741,0.291667,solar_Fri_Jun_30_15__54__7_2017_L_0.5657411764...,2017-06-30 15:54:07
45752,6,30,15,54,12,2017,0.565741,0.291667,solar_Fri_Jun_30_15__54__12_2017_L_0.565741176...,2017-06-30 15:54:12


In [30]:
# Work on a copy so the helper column added below never ends up in `df`
df_copy = df.copy()

# Gap between each row and the row directly before it (NaT for the first row)
df_copy['time_diff'] = df_copy['timestamp'].diff()

# A row is kept when at least one of these conditions holds
gap_over_20s = df_copy['time_diff'].dt.total_seconds() > 20
new_calendar_day = df_copy['timestamp'].dt.date != df_copy['timestamp'].shift().dt.date
no_previous_row = df_copy['time_diff'].isna()

filtered_df = df_copy[gap_over_20s | new_calendar_day | no_previous_row]

# The helper column was only needed to build the mask
filtered_df = filtered_df.drop(columns=['time_diff'])

# The index is left as-is (not reset), so the labels still match the rows of `df`
filtered_df


Unnamed: 0,month,day,hour,minute,second,year,loss_percentage,irradiance_level,original_title,timestamp
0,6,13,9,46,49,2017,0.047484,0.296733,solar_Tue_Jun_13_9__46__49_2017_L_0.0474843723...,2017-06-13 09:46:49
5,6,13,9,47,54,2017,0.021836,0.613851,solar_Tue_Jun_13_9__47__54_2017_L_0.0218357907...,2017-06-13 09:47:54
12,6,13,9,49,36,2017,0.449538,0.652835,solar_Tue_Jun_13_9__49__36_2017_L_0.4495383635...,2017-06-13 09:49:36
19,6,13,9,51,32,2017,0.061690,0.376580,solar_Tue_Jun_13_9__51__32_2017_L_0.0616903403...,2017-06-13 09:51:32
21,6,13,9,52,2,2017,0.322995,0.432800,solar_Tue_Jun_13_9__52__2_2017_L_0.32299481715...,2017-06-13 09:52:02
...,...,...,...,...,...,...,...,...,...,...
45593,6,30,15,20,46,2017,0.566062,0.305953,solar_Fri_Jun_30_15__20__46_2017_L_0.566061678...,2017-06-30 15:20:46
45605,6,30,15,22,17,2017,0.557589,0.295945,solar_Fri_Jun_30_15__22__17_2017_L_0.557588847...,2017-06-30 15:22:17
45745,6,30,15,53,7,2017,0.559479,0.226212,solar_Fri_Jun_30_15__53__7_2017_L_0.5594792316...,2017-06-30 15:53:07
45751,6,30,15,54,7,2017,0.565741,0.291667,solar_Fri_Jun_30_15__54__7_2017_L_0.5657411764...,2017-06-30 15:54:07


In [32]:

# Greedy thinning: keep the first row, then keep every later row that lies more
# than 20 seconds after the most recently *kept* row.

# Untouched copy of the frame (not used further in this cell)
df_copy = df.copy()

# Plain list of Timestamps, so the loop avoids a Series.iloc lookup per iteration
timestamps = list(df['timestamp'])

# 0-based positions of the rows to keep; stays empty when the frame has no rows
indices_to_keep = [0] if timestamps else []

# Iterate through the rows and keep those more than 20 seconds after the last kept one
for i in range(1, len(timestamps)):
    # total_seconds() measures the whole gap. The `.seconds` attribute holds only
    # the seconds component (0-86399) and ignores whole days, so a gap of one day
    # plus a few seconds would wrongly look like "a few seconds".
    time_difference = (timestamps[i] - timestamps[indices_to_keep[-1]]).total_seconds()
    if time_difference > 20:
        indices_to_keep.append(i)

# Select by position, consistent with how the positions were collected above
filtered_df = df.iloc[indices_to_keep]

# Drop the timestamp column if you don't need it in the final result
filtered_df = filtered_df.drop(columns='timestamp')

# Show the result
filtered_df

Unnamed: 0,month,day,hour,minute,second,year,loss_percentage,irradiance_level,original_title
0,6,13,9,46,49,2017,0.047484,0.296733,solar_Tue_Jun_13_9__46__49_2017_L_0.0474843723...
5,6,13,9,47,54,2017,0.021836,0.613851,solar_Tue_Jun_13_9__47__54_2017_L_0.0218357907...
8,6,13,9,48,20,2017,0.142282,0.620973,solar_Tue_Jun_13_9__48__20_2017_L_0.1422815570...
11,6,13,9,48,50,2017,0.082342,0.465396,solar_Tue_Jun_13_9__48__50_2017_L_0.0823418382...
12,6,13,9,49,36,2017,0.449538,0.652835,solar_Tue_Jun_13_9__49__36_2017_L_0.4495383635...
...,...,...,...,...,...,...,...,...,...
45740,6,30,15,36,51,2017,0.514036,0.323675,solar_Fri_Jun_30_15__36__51_2017_L_0.514036129...
45745,6,30,15,53,7,2017,0.559479,0.226212,solar_Fri_Jun_30_15__53__7_2017_L_0.5594792316...
45750,6,30,15,53,32,2017,0.567408,0.243176,solar_Fri_Jun_30_15__53__32_2017_L_0.567408482...
45751,6,30,15,54,7,2017,0.565741,0.291667,solar_Fri_Jun_30_15__54__7_2017_L_0.5657411764...
