In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [36]:
# Location of the Bengaluru CSV on the local machine
CSV_PATH = 'C:\\Users\\Asus\\Documents\\A MINI PROJECT\\Bengaluru 2009-2019\\Bengaluru.csv'

# Read the raw file into a DataFrame
df = pd.read_csv(CSV_PATH)

# Preview the top rows as the cell output
df.head()

Unnamed: 0,date_time,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,uvIndex.1,moon_illumination,moonrise,moonset,...,WindChillC,WindGustKmph,cloudcover,humidity,precipMM,pressure,tempC,visibility,winddirDegree,windspeedKmph
0,01-01-2009 0.00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,18,11,2,91,0.0,1014,14,10,109,8
1,01-01-2009 1.00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,17,9,2,93,0.0,1014,14,7,85,6
2,01-01-2009 2.00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,16,7,2,94,0.0,1014,13,5,61,4
3,01-01-2009 3.00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,15,5,2,96,0.0,1014,12,2,37,3
4,01-01-2009 4.00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,18,5,1,88,0.0,1015,14,5,45,3


In [37]:
# Count the null entries in every column and print the per-column totals
null_counts = df.isnull().sum()
print(null_counts)

date_time            0
maxtempC             0
mintempC             0
totalSnow_cm         0
sunHour              0
uvIndex              0
uvIndex.1            0
moon_illumination    0
moonrise             0
moonset              0
sunrise              0
sunset               0
DewPointC            0
FeelsLikeC           0
HeatIndexC           0
WindChillC           0
WindGustKmph         0
cloudcover           0
humidity             0
precipMM             0
pressure             0
tempC                0
visibility           0
winddirDegree        0
windspeedKmph        0
dtype: int64


In [38]:
# Display the dataframe info: column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96432 entries, 0 to 96431
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date_time          96432 non-null  object 
 1   maxtempC           96432 non-null  int64  
 2   mintempC           96432 non-null  int64  
 3   totalSnow_cm       96432 non-null  int64  
 4   sunHour            96432 non-null  float64
 5   uvIndex            96432 non-null  int64  
 6   uvIndex.1          96432 non-null  int64  
 7   moon_illumination  96432 non-null  int64  
 8   moonrise           96432 non-null  object 
 9   moonset            96432 non-null  object 
 10  sunrise            96432 non-null  object 
 11  sunset             96432 non-null  object 
 12  DewPointC          96432 non-null  int64  
 13  FeelsLikeC         96432 non-null  int64  
 14  HeatIndexC         96432 non-null  int64  
 15  WindChillC         96432 non-null  int64  
 16  WindGustKmph       964

In [39]:
# Convert date_time column to datetime with specified format.
# The raw strings look like '01-01-2009 0.00' (day-month-year, then hour.minute).
date_time_format = '%d-%m-%Y %H.%M'
raw_date_time = df['date_time']
df['date_time'] = pd.to_datetime(raw_date_time, format=date_time_format, errors='coerce')

# errors='coerce' replaces every value that does not match the format with NaT
# instead of raising. Count the values that were present before the conversion
# but are NaT afterwards, so parsing failures are reported rather than hidden.
unparsed_count = int((df['date_time'].isna() & raw_date_time.notna()).sum())
if unparsed_count:
    print(f"Warning: {unparsed_count} date_time value(s) did not match "
          f"{date_time_format!r} and were set to NaT")

In [40]:
# Derive a calendar-date key (time of day dropped) for every row
timestamps = df['date_time']
df['day'] = timestamps.dt.date

In [41]:
# First half of the daily aggregation: gather the labels of all numeric
# columns; these are the columns that get averaged per day afterwards.
numeric_only = df.select_dtypes(include='number')
numeric_columns = numeric_only.columns

In [42]:
# Average every numeric column within each calendar day, then turn the
# 'day' group key back into an ordinary column.
# NOTE(review): the same plain mean is applied to all numeric columns,
# including winddirDegree and precipMM - confirm that is intended.
per_day = df.groupby('day')[numeric_columns]
daily_average = per_day.mean().reset_index()

In [43]:
# Create a binary column flagging rows with recorded precipitation.
# Each row is judged on its own precipMM value: 1 when precipMM > 0, else 0
# (missing values compare as False and therefore become 0, as before).
# A vectorised comparison replaces the per-element Python lambda; the explicit
# 'int64' keeps the dtype independent of the platform's default integer width.
df['rainy_day'] = (df['precipMM'] > 0).astype('int64')

In [44]:
# Display the first rows of the updated dataframe, which now also carries
# the derived 'day' and 'rainy_day' columns
df.head()

Unnamed: 0,date_time,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,uvIndex.1,moon_illumination,moonrise,moonset,...,cloudcover,humidity,precipMM,pressure,tempC,visibility,winddirDegree,windspeedKmph,day,rainy_day
0,2009-01-01 00:00:00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,2,91,0.0,1014,14,10,109,8,2009-01-01,0
1,2009-01-01 01:00:00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,2,93,0.0,1014,14,7,85,6,2009-01-01,0
2,2009-01-01 02:00:00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,2,94,0.0,1014,13,5,61,4,2009-01-01,0
3,2009-01-01 03:00:00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,2,96,0.0,1014,12,2,37,3,2009-01-01,0
4,2009-01-01 04:00:00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,1,88,0.0,1015,14,5,45,3,2009-01-01,0


In [47]:
# Display the resulting DataFrame of per-day averages.
# Ending the cell with the bare DataFrame lets Jupyter render its rich table
# view; print() only produces a plain-text dump that splits a wide frame
# into several wrapped column blocks.
daily_average

             day  maxtempC  mintempC  totalSnow_cm  sunHour  uvIndex  \
0     2009-01-01      27.0      12.0           0.0     11.6      5.0   
1     2009-01-02      27.0      16.0           0.0     11.6      5.0   
2     2009-01-03      25.0      15.0           0.0     11.6      5.0   
3     2009-01-04      27.0      15.0           0.0     11.6      6.0   
4     2009-01-05      28.0      16.0           0.0     11.6      5.0   
...          ...       ...       ...           ...      ...      ...   
4013  2019-12-28      26.0      18.0           0.0     11.6      6.0   
4014  2019-12-29      27.0      17.0           0.0     10.2      6.0   
4015  2019-12-30      25.0      17.0           0.0     10.2      5.0   
4016  2019-12-31      25.0      19.0           0.0     11.6      5.0   
4017  2020-01-01      26.0      18.0           0.0      8.7      6.0   

      uvIndex.1  moon_illumination  DewPointC  FeelsLikeC  ...  WindChillC  \
0      3.791667               31.0  13.750000   21.416667

In [48]:
# Print the schema again (column dtypes and non-null counts) now that
# date_time has been converted and the derived columns have been added
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96432 entries, 0 to 96431
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date_time          96432 non-null  datetime64[ns]
 1   maxtempC           96432 non-null  int64         
 2   mintempC           96432 non-null  int64         
 3   totalSnow_cm       96432 non-null  int64         
 4   sunHour            96432 non-null  float64       
 5   uvIndex            96432 non-null  int64         
 6   uvIndex.1          96432 non-null  int64         
 7   moon_illumination  96432 non-null  int64         
 8   moonrise           96432 non-null  object        
 9   moonset            96432 non-null  object        
 10  sunrise            96432 non-null  object        
 11  sunset             96432 non-null  object        
 12  DewPointC          96432 non-null  int64         
 13  FeelsLikeC         96432 non-null  int64         
 14  HeatIn

In [32]:
# Preview the first rows of df.
# NOTE(review): this cell carries an execution count (32) that is out of
# sequence with its neighbours, and its saved output has no 'rainy_day'
# column - looks like a leftover from an earlier run; confirm it is still needed.
df.head()

Unnamed: 0,date_time,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,uvIndex.1,moon_illumination,moonrise,moonset,...,WindGustKmph,cloudcover,humidity,precipMM,pressure,tempC,visibility,winddirDegree,windspeedKmph,day
0,2009-01-01 00:00:00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,11,2,91,0.0,1014,14,10,109,8,2009-01-01
1,2009-01-01 01:00:00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,9,2,93,0.0,1014,14,7,85,6,2009-01-01
2,2009-01-01 02:00:00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,7,2,94,0.0,1014,13,5,61,4,2009-01-01
3,2009-01-01 03:00:00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,5,2,96,0.0,1014,12,2,37,3,2009-01-01
4,2009-01-01 04:00:00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,5,1,88,0.0,1015,14,5,45,3,2009-01-01


In [49]:
# Write the per-day averages to a CSV file, leaving out the row index
output_csv = 'daily_average.csv'
daily_average.to_csv(output_csv, index=False)

In [50]:
# Preview the first rows of df once more after the export
df.head()


Unnamed: 0,date_time,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,uvIndex.1,moon_illumination,moonrise,moonset,...,cloudcover,humidity,precipMM,pressure,tempC,visibility,winddirDegree,windspeedKmph,day,rainy_day
0,2009-01-01 00:00:00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,2,91,0.0,1014,14,10,109,8,2009-01-01,0
1,2009-01-01 01:00:00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,2,93,0.0,1014,14,7,85,6,2009-01-01,0
2,2009-01-01 02:00:00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,2,94,0.0,1014,13,5,61,4,2009-01-01,0
3,2009-01-01 03:00:00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,2,96,0.0,1014,12,2,37,3,2009-01-01,0
4,2009-01-01 04:00:00,27,12,0,11.6,5,1,31,9.58 AM,10.03 PM,...,1,88,0.0,1015,14,5,45,3,2009-01-01,0
