In [5]:
import os
# Report the directory the kernel is currently running in
current_dir = os.getcwd()
print(current_dir)


C:\Users\swath


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Folder that holds the two unemployment CSV files.
# Set the UNEMPLOYMENT_DATA_DIR environment variable to point the notebook at
# another location; when it is unset the original folder is used, so the
# resulting paths are unchanged on the machine this was written on.
DATA_DIR = os.environ.get(
    "UNEMPLOYMENT_DATA_DIR", "C:/Users/swath/Downloads/Unemployment"
).rstrip("/\\")  # tolerate a trailing separator in the override

file_path1 = f"{DATA_DIR}/Unemployment in India.csv"
file_path2 = f"{DATA_DIR}/Unemployment_Rate_upto_11_2020.csv"

In [None]:
# Load each CSV file into its own DataFrame
df1 = pd.read_csv(filepath_or_buffer=file_path1)
df2 = pd.read_csv(filepath_or_buffer=file_path2)

In [None]:
# First rows of the first dataset, rendered as a rich table
head_1 = df1.head()
print("Dataset 1 Preview:")
display(head_1)

In [None]:
# First rows of the second dataset, rendered as a rich table
head_2 = df2.head()
print("Dataset 2 Preview:")
display(head_2)

In [None]:
# Number of null entries in every column of the first dataset
null_counts_1 = df1.isna().sum()
print("\nMissing Values in Dataset 1:")
print(null_counts_1)

In [None]:
# Number of null entries in every column of the second dataset
null_counts_2 = df2.isna().sum()
print("\nMissing Values in Dataset 2:")
print(null_counts_2)

In [None]:
# Summary statistics for both datasets.
# display() renders the describe() frames as rich tables, matching how the
# preview cells show DataFrames; the plain-text headings are kept as they were.
print("\nStatistical Summary of Dataset 1:")
display(df1.describe())
print("\nStatistical Summary of Dataset 2:")
display(df2.describe())

In [None]:
# List the header names of both datasets (useful for spotting stray whitespace)
columns_1 = df1.columns
columns_2 = df2.columns
print("\nColumn Names in Dataset 1:")
print(columns_1)
print("\nColumn Names in Dataset 2:")
print(columns_2)

In [None]:
# Remove leading/trailing whitespace from the header names of both frames
for frame in (df1, df2):
    frame.columns = frame.columns.str.strip()

In [None]:
# Convert the 'Date' column of both frames to datetimes, reading the day before the month
for frame in (df1, df2):
    frame['Date'] = pd.to_datetime(frame['Date'], dayfirst=True)


In [None]:
# Outer-join the two datasets on their shared keys so rows from either side are kept
df_merged = df1.merge(df2, how='outer', on=['Region', 'Date'])

# Right-hand (_y) copies of the measures that exist in both inputs
columns_to_fill = [
    'Estimated Unemployment Rate (%)_y',
    'Estimated Employed_y',
    'Estimated Labour Participation Rate (%)_y',
    'Frequency_y'
]

# Pair every _y column with its left-hand _x twin
x_twin = {name_y: name_y.replace("_y", "_x") for name_y in columns_to_fill}

# Where the _y value is missing, fall back to the _x value of the same row
present = set(df_merged.columns)
for name_y, name_x in x_twin.items():
    if name_x in present and name_y in present:
        df_merged[name_y] = df_merged[name_y].combine_first(df_merged[name_x])

df_merged = (
    df_merged
    # the _x copies are redundant once their values have been folded into _y
    .drop(columns=list(x_twin.values()), errors='ignore')
    # give the surviving columns their plain names back
    .rename(columns={
        'Estimated Unemployment Rate (%)_y': 'Estimated Unemployment Rate (%)',
        'Estimated Employed_y': 'Estimated Employed',
        'Estimated Labour Participation Rate (%)_y': 'Estimated Labour Participation Rate (%)',
        'Frequency_y': 'Frequency'
    })
    # the 'Region.1' column is not needed in this view
    .drop(columns=['Region.1'], errors='ignore')
)

# Show the first rows of the cleaned result
print("\nCleaned Merged Dataset:")
display(df_merged.head())


In [None]:
# Normalise the coordinate column names (a no-op when they are already lowercase)
df2 = df2.rename(columns={'Longitude': 'longitude', 'Latitude': 'latitude'})

# df2 also carries a 'Date' column, so a region can occur on many rows. Joining
# on 'Region' alone against the full frame would pair every df1 row with each
# of those rows and multiply the row count. Reduce df2 to a single coordinate
# row per region first (this assumes the location is constant for a region).
region_coords = df2[['Region', 'longitude', 'latitude']].drop_duplicates(subset='Region')

# Attach the coordinates to every df1 row. validate= makes pandas raise if the
# right-hand side ever stops being unique per region, so the row count of df1
# is guaranteed to be preserved.
df_merged = pd.merge(df1, region_coords, on='Region', how='left', validate='many_to_one')

# Preview the first rows instead of rendering the whole frame
df_merged.head()


In [None]:
# Compare the dtype of both 'Date' columns and look at a few distinct values from each
dates_1, dates_2 = df1['Date'], df2['Date']
print(dates_1.dtype, dates_2.dtype)
print(dates_1.unique()[:5])  # first few unique dates in df1
print(dates_2.unique()[:5])  # first few unique dates in df2


In [None]:
# 'Zone' is df2's 'Region.1' column under a descriptive name. Rename it here so
# the selection below does not raise KeyError on a top-to-bottom run
# (the rename is a no-op when the column has already been renamed).
df2 = df2.rename(columns={'Region.1': 'Zone'})

# One row per distinct region / zone / coordinate combination
df2_subset = df2[['Region', 'Zone', 'longitude', 'latitude']].drop_duplicates()

# Keep only the df1 observations dated on or after df2's earliest date.
# The filter runs before the merge so df_merged reflects it, and .copy() yields
# an independent frame that later cells can assign columns on without a
# SettingWithCopyWarning.
df1 = df1[df1['Date'] >= df2['Date'].min()].copy()

# Attach zone and coordinates to every remaining df1 row
df_merged = pd.merge(df1, df2_subset, on='Region', how='left')


In [None]:
# Make sure 'Date' is a datetime column in both frames.
# dayfirst=True matches the parsing used when the dates were first converted,
# so this cell gives the same result if it ever runs on the raw strings.
# .assign() builds a new frame instead of writing into df1, which avoids a
# SettingWithCopyWarning when df1 is the result of a row filter.
df1 = df1.assign(Date=pd.to_datetime(df1['Date'], dayfirst=True))
df2 = df2.assign(Date=pd.to_datetime(df2['Date'], dayfirst=True))

# Give df2's 'Region.1' column its descriptive name (no-op if already renamed)
df2 = df2.rename(columns={'Region.1': 'Zone'})

# Columns of df2 that should be attached to df1
df2_subset = df2[['Region', 'Date', 'Zone', 'longitude', 'latitude']]

# Left join: keep every df1 row and add zone/coordinates where region and date match
df_merged = pd.merge(df1, df2_subset, on=['Region', 'Date'], how='left')

# Display merged dataset preview
print("\nMerged Dataset Preview:")
display(df_merged.head())


In [None]:
# Render the merged frame as this cell's output (bare last expression of the cell)
df_merged

In [None]:
# Line chart of the unemployment rate against the observation date
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=df_merged,
    x='Date',
    y='Estimated Unemployment Rate (%)',
    marker='o',
    color='b',
    ax=ax,
)
ax.set_title("Unemployment Rate Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Estimated Unemployment Rate (%)")
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the date labels
ax.grid(True)
plt.show()


In [None]:
# Histogram (20 bins) of the unemployment rate with a KDE curve on top
fig, ax = plt.subplots(figsize=(8, 5))
rate_values = df_merged['Estimated Unemployment Rate (%)']
sns.histplot(rate_values, bins=20, kde=True, color='r', ax=ax)
ax.set_title("Distribution of Estimated Unemployment Rate")
ax.set_xlabel("Estimated Unemployment (%)")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# One box per region showing the spread of its unemployment-rate observations
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df_merged, x='Region', y='Estimated Unemployment Rate (%)', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)  # region names are long; print them vertically
ax.set_title("Unemployment Rate by Region")
ax.set_xlabel("Region")
ax.set_ylabel("Estimated Unemployment Rate (%)")
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns
fig, ax = plt.subplots(figsize=(8, 5))
numeric_df = df_merged.select_dtypes(include=['number'])  # correlation is only defined for numbers
corr_matrix = numeric_df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# Scatter of the coordinates; marker colour and marker size both encode the unemployment rate
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df_merged,
    x='longitude',
    y='latitude',
    hue='Estimated Unemployment Rate (%)',
    size='Estimated Unemployment Rate (%)',
    palette='coolwarm',
    sizes=(20, 200),
    ax=ax,
)
ax.set_title("Unemployment Rate by Location")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
plt.show()


In [None]:
# Headline statistics of the unemployment-rate column
print("\nUnemployment Analysis:")
rate_column = df_merged['Estimated Unemployment Rate (%)']
avg_unemployment, max_unemployment, min_unemployment = (
    rate_column.mean(),
    rate_column.max(),
    rate_column.min(),
)
print(f"Average Unemployment Rate: {avg_unemployment:.2f}%")
print(f"Highest Unemployment Rate: {max_unemployment:.2f}%")
print(f"Lowest Unemployment Rate: {min_unemployment:.2f}%")

In [None]:
# Find the region of the row holding the highest / lowest unemployment rate.
# idxmax()/idxmin() return the index label of the first extreme row directly,
# so the cell no longer rescans the column by float equality and no longer
# depends on max/min values computed in another cell. df_merged comes straight
# from pd.merge, so its index labels are unique and .loc returns a single value.
rate_column = df_merged['Estimated Unemployment Rate (%)']
highest_region = df_merged.loc[rate_column.idxmax(), 'Region']
lowest_region = df_merged.loc[rate_column.idxmin(), 'Region']
print(f"Region with Highest Unemployment Rate: {highest_region}")
print(f"Region with Lowest Unemployment Rate: {lowest_region}")
