In [246]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [247]:
# Location of the hourly Bengaluru weather export (absolute path on the author's machine).
csv_path = 'C:\\Users\\Asus\\Documents\\A MINI PROJECT\\Bengaluru 2009-2019\\Bengaluru.csv'

# Read the raw observations into a DataFrame.
df = pd.read_csv(csv_path)

# Preview the first five rows as the cell's output.
df.head()

Unnamed: 0,date_time,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,uvIndex.1,moon_illumination,moonrise,moonset,...,WindChillC,WindGustKmph,cloudcover,humidity,precipMM,pressure,tempC,visibility,winddirDegree,windspeedKmph
0,01-01-2009 0.00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,18,11,2,91,0.0,1014,14,10,109,8
1,01-01-2009 1.00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,17,9,2,93,0.0,1014,14,7,85,6
2,01-01-2009 2.00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,16,7,2,94,0.0,1014,13,5,61,4
3,01-01-2009 3.00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,15,5,2,96,0.0,1014,12,2,37,3
4,01-01-2009 4.00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,18,5,1,88,0.0,1015,14,5,45,3


In [248]:
# Count the missing entries in every column and print the per-column totals.
missing_per_column = df.isna().sum()
print(missing_per_column)

date_time            0
maxtempC             0
mintempC             0
totalSnow_cm         0
sunHour              0
uvIndex              0
uvIndex.1            0
moon_illumination    0
moonrise             0
moonset              0
sunrise              0
sunset               0
DewPointC            0
FeelsLikeC           0
HeatIndexC           0
WindChillC           0
WindGustKmph         0
cloudcover           0
humidity             0
precipMM             0
pressure             0
tempC                0
visibility           0
winddirDegree        0
windspeedKmph        0
dtype: int64


In [249]:
# Summarise the raw frame: row count, column names, non-null counts and dtypes.
# DataFrame.info() writes the report itself and returns None, so wrapping it in
# print() would add a stray "None" line after the summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96432 entries, 0 to 96431
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date_time          96432 non-null  object 
 1   maxtempC           96432 non-null  int64  
 2   mintempC           96432 non-null  int64  
 3   totalSnow_cm       96432 non-null  int64  
 4   sunHour            96432 non-null  float64
 5   uvIndex            96432 non-null  int64  
 6   uvIndex.1          96432 non-null  int64  
 7   moon_illumination  96432 non-null  int64  
 8   moonrise           96432 non-null  object 
 9   moonset            96432 non-null  object 
 10  sunrise            96432 non-null  object 
 11  sunset             96432 non-null  object 
 12  DewPointC          96432 non-null  int64  
 13  FeelsLikeC         96432 non-null  int64  
 14  HeatIndexC         96432 non-null  int64  
 15  WindChillC         96432 non-null  int64  
 16  WindGustKmph       964

In [197]:
# Parse the raw 'date_time' text (day-month-year, then hour.minute) into
# timestamps; entries that do not match the pattern become NaT because of
# errors='coerce' instead of raising.
raw_date_time = df['date_time']
df['date_time'] = pd.to_datetime(raw_date_time, format='%d-%m-%Y %H.%M', errors='coerce')

In [198]:
# Keep only the calendar date of each timestamp so rows can be grouped per day.
timestamps = df['date_time']
df['day'] = timestamps.dt.date

In [199]:
# Collect the labels of all numeric columns; these are the columns that get
# averaged when the data is grouped by day.
numeric_frame = df.select_dtypes(include='number')
numeric_columns = numeric_frame.columns

In [252]:
# Collapse the hourly observations into one row per calendar day by averaging
# every numeric column. The grouping has to start from the hourly frame `df`
# (which carries the 'day' column): `daily_average` does not exist until this
# assignment has run, so grouping it would raise NameError on a fresh kernel.
daily_average = df.groupby('day')[numeric_columns].mean().reset_index()

In [255]:
# Flag each day as rainy (1) or dry (0): a day counts as rainy when its
# averaged precipMM value is above zero. The vectorised comparison replaces a
# per-row Python lambda; NaN compares as False and therefore maps to 0, just as
# `1 if x > 0 else 0` does. int64 is requested explicitly so the dtype does not
# depend on the platform's default integer width.
daily_average['rainy_day'] = daily_average['precipMM'].gt(0).astype('int64')

In [256]:
# Preview the first five rows of the daily frame, now including 'rainy_day'.
daily_average.head()

Unnamed: 0,day,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,uvIndex.1,moon_illumination,DewPointC,FeelsLikeC,...,WindGustKmph,cloudcover,humidity,precipMM,pressure,tempC,visibility,winddirDegree,windspeedKmph,rainy_day
0,2009-01-01,27.0,12.0,0.0,11.6,5.0,3.791667,31.0,13.75,21.416667,...,11.041667,6.958333,66.5,0.0,1013.875,19.041667,8.583333,84.208333,6.708333,0
1,2009-01-02,27.0,16.0,0.0,11.6,5.0,3.833333,38.0,14.541667,21.208333,...,12.125,36.416667,69.916667,0.0,1014.333333,20.625,7.833333,99.291667,7.958333,0
2,2009-01-03,25.0,15.0,0.0,11.6,5.0,3.708333,45.0,14.583333,20.958333,...,12.0,42.0,71.0,0.0,1015.0,20.166667,8.083333,99.416667,8.333333,0
3,2009-01-04,27.0,15.0,0.0,11.6,6.0,3.875,52.0,14.25,21.541667,...,11.541667,38.166667,67.291667,0.0,1014.25,20.791667,9.0,83.833333,7.625,0
4,2009-01-05,28.0,16.0,0.0,11.6,5.0,3.833333,60.0,14.291667,21.625,...,11.416667,25.125,67.083333,0.0,1012.708333,20.666667,8.75,104.875,7.5,0


In [257]:
# Show the daily averages as the cell's rich output instead of print();
# pandas truncates long frames to their first and last rows, so the full
# table is not dumped.
daily_average

             day  maxtempC  mintempC  totalSnow_cm  sunHour  uvIndex  \
0     2009-01-01      27.0      12.0           0.0     11.6      5.0   
1     2009-01-02      27.0      16.0           0.0     11.6      5.0   
2     2009-01-03      25.0      15.0           0.0     11.6      5.0   
3     2009-01-04      27.0      15.0           0.0     11.6      6.0   
4     2009-01-05      28.0      16.0           0.0     11.6      5.0   
...          ...       ...       ...           ...      ...      ...   
4013  2019-12-28      26.0      18.0           0.0     11.6      6.0   
4014  2019-12-29      27.0      17.0           0.0     10.2      6.0   
4015  2019-12-30      25.0      17.0           0.0     10.2      5.0   
4016  2019-12-31      25.0      19.0           0.0     11.6      5.0   
4017  2020-01-01      26.0      18.0           0.0      8.7      6.0   

      uvIndex.1  moon_illumination  DewPointC  FeelsLikeC  ...  WindGustKmph  \
0      3.791667               31.0  13.750000   21.4166

In [258]:
# Summarise the daily frame: row count, column names, non-null counts and dtypes.
daily_average.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4018 entries, 0 to 4017
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   day                4018 non-null   object 
 1   maxtempC           4018 non-null   float64
 2   mintempC           4018 non-null   float64
 3   totalSnow_cm       4018 non-null   float64
 4   sunHour            4018 non-null   float64
 5   uvIndex            4018 non-null   float64
 6   uvIndex.1          4018 non-null   float64
 7   moon_illumination  4018 non-null   float64
 8   DewPointC          4018 non-null   float64
 9   FeelsLikeC         4018 non-null   float64
 10  HeatIndexC         4018 non-null   float64
 11  WindChillC         4018 non-null   float64
 12  WindGustKmph       4018 non-null   float64
 13  cloudcover         4018 non-null   float64
 14  humidity           4018 non-null   float64
 15  precipMM           4018 non-null   float64
 16  pressure           4018 

In [259]:
# Preview the first five rows of the daily frame before it is saved.
daily_average.head()

Unnamed: 0,day,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,uvIndex.1,moon_illumination,DewPointC,FeelsLikeC,...,WindGustKmph,cloudcover,humidity,precipMM,pressure,tempC,visibility,winddirDegree,windspeedKmph,rainy_day
0,2009-01-01,27.0,12.0,0.0,11.6,5.0,3.791667,31.0,13.75,21.416667,...,11.041667,6.958333,66.5,0.0,1013.875,19.041667,8.583333,84.208333,6.708333,0
1,2009-01-02,27.0,16.0,0.0,11.6,5.0,3.833333,38.0,14.541667,21.208333,...,12.125,36.416667,69.916667,0.0,1014.333333,20.625,7.833333,99.291667,7.958333,0
2,2009-01-03,25.0,15.0,0.0,11.6,5.0,3.708333,45.0,14.583333,20.958333,...,12.0,42.0,71.0,0.0,1015.0,20.166667,8.083333,99.416667,8.333333,0
3,2009-01-04,27.0,15.0,0.0,11.6,6.0,3.875,52.0,14.25,21.541667,...,11.541667,38.166667,67.291667,0.0,1014.25,20.791667,9.0,83.833333,7.625,0
4,2009-01-05,28.0,16.0,0.0,11.6,5.0,3.833333,60.0,14.291667,21.625,...,11.416667,25.125,67.083333,0.0,1012.708333,20.666667,8.75,104.875,7.5,0


In [261]:
# Save the dataframe to a new CSV file
#daily_average.to_csv('daily_average.csv', index=False)

In [262]:
# Write the per-day averages to daily.csv without the row index.
# `monthly_rain` is not created until a later cell, so the daily frame is the
# one that belongs in this file; referencing `monthly_rain` here fails with
# NameError on a fresh kernel.
daily_average.to_csv('daily.csv', index=False)


In [264]:
# Derive the calendar year and month of each daily row.
# 'day' holds date values (displayed as YYYY-MM-DD), so it is converted without
# a format string: a '%d-%m-%Y' pattern does not describe these values.
# Converting once and reusing the result also avoids parsing the column twice.
day_timestamps = pd.to_datetime(daily_average['day'])
daily_average['year'] = day_timestamps.dt.year
daily_average['month'] = day_timestamps.dt.month


In [268]:
#daily_average.head()


In [269]:
# Show the daily frame (now with 'year' and 'month') as the cell's rich output
# instead of print(); pandas truncates long frames to their first and last rows.
daily_average


             day  maxtempC  mintempC  totalSnow_cm  sunHour  uvIndex  \
0     2009-01-01      27.0      12.0           0.0     11.6      5.0   
1     2009-01-02      27.0      16.0           0.0     11.6      5.0   
2     2009-01-03      25.0      15.0           0.0     11.6      5.0   
3     2009-01-04      27.0      15.0           0.0     11.6      6.0   
4     2009-01-05      28.0      16.0           0.0     11.6      5.0   
...          ...       ...       ...           ...      ...      ...   
4013  2019-12-28      26.0      18.0           0.0     11.6      6.0   
4014  2019-12-29      27.0      17.0           0.0     10.2      6.0   
4015  2019-12-30      25.0      17.0           0.0     10.2      5.0   
4016  2019-12-31      25.0      19.0           0.0     11.6      5.0   
4017  2020-01-01      26.0      18.0           0.0      8.7      6.0   

      uvIndex.1  moon_illumination  DewPointC  FeelsLikeC  ...  precipMM  \
0      3.791667               31.0  13.750000   21.416667  

In [270]:

# Count the rainy days in every (month, year) pair and flatten the grouped
# result back into ordinary columns.
rainy_flags_by_period = daily_average.groupby(['month', 'year'])['rainy_day']
monthly_rain = rainy_flags_by_period.sum().reset_index()

In [271]:
# Write the monthly rainy-day counts to disk without the row index.
monthly_rain_file = 'monthly_rain.csv'
monthly_rain.to_csv(monthly_rain_file, index=False)

In [272]:
# Show the monthly rainy-day counts as the cell's rich output instead of print().
monthly_rain

     month  year  rainy_day
0        1  2009          4
1        1  2010          9
2        1  2011          3
3        1  2012          1
4        1  2013          0
..     ...   ...        ...
128     12  2015         10
129     12  2016          7
130     12  2017          5
131     12  2018          9
132     12  2019         10

[133 rows x 3 columns]


In [273]:
# Optional: rename columns for clarity.
# NOTE(review): monthly_rain is built by grouping on 'month' and 'year', so its
# columns are 'month', 'year' and 'rainy_day'; neither 'date_time.year' nor
# 'date_time.month' exists. rename() skips unknown labels by default, which
# makes this call a silent no-op (the frame printed afterwards still shows the
# lowercase 'month'/'year' headers). Confirm whether 'year' -> 'Year' and
# 'month' -> 'Month' was intended before changing the keys, since later cells
# may rely on the current names.
monthly_rain.rename(columns={'date_time.year': 'Year', 'date_time.month': 'Month'}, inplace=True)

In [274]:

# Show the monthly rainy-day counts as the cell's rich output instead of print().
monthly_rain

     month  year  rainy_day
0        1  2009          4
1        1  2010          9
2        1  2011          3
3        1  2012          1
4        1  2013          0
..     ...   ...        ...
128     12  2015         10
129     12  2016          7
130     12  2017          5
131     12  2018          9
132     12  2019         10

[133 rows x 3 columns]


In [284]:
#print(daily_average['precipMM'].dtype)
#print(daily_average['rainy_day'].dtype)


float64
int64
