In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load the raw dataset from a hard-coded absolute path.
# NOTE(review): "/content/..." looks like a hosted-notebook location — confirm, and
# consider a configurable data directory so the notebook also runs elsewhere.
# No parse_dates argument is passed, so read_csv does not convert 'Date' to datetimes.
df = pd.read_csv("/content/adilabad.csv")

# Print a plain-text preview of the loaded frame.
print(df)

               Mandal        Date  Rain (mm)  Min_Temp  Max_Temp  \
0      Adilabad Rural  01-01-2018        0.0       9.9      29.4   
1      Adilabad Rural  02-01-2018        0.0      11.4      31.1   
2      Adilabad Rural  03-01-2018        0.0      10.2      30.7   
3      Adilabad Rural  04-01-2018        0.0       9.9      30.5   
4      Adilabad Rural  05-01-2018        0.0      10.4      29.0   
...               ...         ...        ...       ...       ...   
31747           Utnur  27-10-2022        0.0      14.8      31.5   
31748           Utnur  28-10-2022        0.0      14.9      31.2   
31749           Utnur  29-10-2022        0.0      14.6      30.9   
31750           Utnur  30-10-2022        0.0      14.3      31.0   
31751           Utnur  31-10-2022        0.0      14.7      30.5   

       Min Humidity (%)  Max Humidity (%)  Min Wind Speed (Kmph)  \
0                  23.2              88.1                    0.0   
1                  20.0              82.5      

In [2]:
# Columns to rescale. 'Mandal' and 'Date' are not listed, so they pass through unchanged.
numerical_cols = [
    'Rain (mm)',
    'Min_Temp',
    'Max_Temp',
    'Min Humidity (%)',
    'Max Humidity (%)',
    'Min Wind Speed (Kmph)',
    'Max Wind Speed (Kmph)',
    'SO2',
    'Nox',
    'PM 10',
    'PM 2.5',
    'NH3',
    'Petrol Vehicle',
    'Diesel Vehicle',
    'Electric Vehicle',
    '1-3 seater',
    '4-7 seater',
    '8-20 seater',
    '20+ seater',
    'AQI',
]

# Rescale every listed column with MinMaxScaler and write the result back into df.
# This is min-max scaling (default range [0, 1]), not z-score standardization.
# The scaler is fitted on the full frame, and 'AQI' is rescaled along with the rest.
scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Persist the rescaled frame without the index column.
# The file name says "standardized" although the data is min-max scaled; the name is
# kept as-is because a later cell reads this exact path.
df.to_csv("standardized_data.csv", index=False)


In [3]:
# Reload the rescaled data that the previous cell wrote to "standardized_data.csv".
df_1 = pd.read_csv("standardized_data.csv")
# Print a plain-text preview of the reloaded frame.
print(df_1)

               Mandal        Date  Rain (mm)  Min_Temp  Max_Temp  \
0      Adilabad Rural  01-01-2018        0.0  0.221538  0.411960   
1      Adilabad Rural  02-01-2018        0.0  0.267692  0.468439   
2      Adilabad Rural  03-01-2018        0.0  0.230769  0.455150   
3      Adilabad Rural  04-01-2018        0.0  0.221538  0.448505   
4      Adilabad Rural  05-01-2018        0.0  0.236923  0.398671   
...               ...         ...        ...       ...       ...   
31747           Utnur  27-10-2022        0.0  0.372308  0.481728   
31748           Utnur  28-10-2022        0.0  0.375385  0.471761   
31749           Utnur  29-10-2022        0.0  0.366154  0.461794   
31750           Utnur  30-10-2022        0.0  0.356923  0.465116   
31751           Utnur  31-10-2022        0.0  0.369231  0.448505   

       Min Humidity (%)  Max Humidity (%)  Min Wind Speed (Kmph)  \
0              0.239841             0.881               0.034247   
1              0.208127             0.825      

In [8]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr

# Split the data into features and target variable.
# 'Mandal' and 'Date' are dropped from the features together with the target 'AQI'.
X = df_1.drop(['AQI', 'Mandal', 'Date'], axis=1)
y = df_1['AQI']

# Pearson correlation of every feature with the target, as (column, r) pairs.
# pearsonr returns (statistic, p-value); only the statistic is kept.
correlations = [(col, pearsonr(X[col], y)[0]) for col in X.columns]

# Rank the features by absolute correlation, strongest first.
# pearsonr yields NaN for a constant column, and NaN sort keys make the ordering
# unreliable, so an undefined correlation gets a key below any real |r| and
# therefore always ranks last instead of possibly landing in the top N.
correlations.sort(
    key=lambda pair: -1.0 if pd.isna(pair[1]) else abs(pair[1]),
    reverse=True,
)

# Select the top N features
N = 5
selected_features = [name for name, _r in correlations[:N]]

# Apply feature scaling to these N features (z-score via StandardScaler).
# NOTE(review): df_1 is read from "standardized_data.csv", which an earlier cell wrote
# after MinMaxScaler. Only the selected columns are z-scored here; the remaining
# columns of X keep their loaded values — confirm this mix of scales is intended.
# NOTE(review): this rebinds `scaler`, which an earlier cell used for the fitted
# MinMaxScaler; that object is no longer reachable by this name afterwards.
scaler = StandardScaler()
X[selected_features] = scaler.fit_transform(X[selected_features])


In [9]:
# Show every (feature, Pearson r) pair, ordered by |r| from strongest to weakest.
print(correlations)

[('PM 10', 0.9438332353722985), ('PM 2.5', 0.5623457330701723), ('NH3', -0.3522294102885297), ('Nox', 0.3205828060794532), ('SO2', -0.2561688637681149), ('Electric Vehicle', -0.25154800617542766), ('1-3 seater', 0.1892403342575068), ('Petrol Vehicle', 0.1868936530288925), ('Diesel Vehicle', 0.18143488408838163), ('4-7 seater', 0.1364794493226889), ('Rain (mm)', -0.11957606339959412), ('8-20 seater', 0.11597809178831468), ('Min Humidity (%)', -0.1101140016203242), ('Max Humidity (%)', -0.10812762373478366), ('Min_Temp', -0.08655409268566443), ('Min Wind Speed (Kmph)', 0.05500341677458708), ('20+ seater', 0.0368104363121579), ('Max_Temp', -0.031279821105150124), ('Max Wind Speed (Kmph)', -0.017820751157195322)]


In [10]:
# Show the names of the top-N features picked from the correlation ranking.
print(selected_features)

['PM 10', 'PM 2.5', 'NH3', 'Nox', 'SO2']
