<a href="https://colab.research.google.com/github/yiyukk/DeloitteProject/blob/main/Sociodemographic_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from google.colab import drive


## Data Cleaning - Sociodemographic

In [None]:
# Load the sociodemographic dataset (xlsx) from the shared Google Drive

# Mount Google Drive so the shared drive is reachable from this runtime
drive.mount('/content/drive', force_remount=True)

# Project root on the shared drive, and the raw-data folder inside it
base_path = '/content/drive/Shared drives/Deloitte'
data_path = f"{base_path}/Original Data"

# Read the Excel workbook into a DataFrame
socio_file = f"{data_path}/Sociodemographic.xlsx"
socio = pd.read_excel(socio_file)

# Preview up to the first 50 rows of the dataframe
socio.head(n=50)


Mounted at /content/drive


Unnamed: 0,City,Province,Population,Young Population,GDB per capita,Unemployment rate
0,Pamplona,Navarra,197488,20539,29134,"8,76%"
1,A Coruña,A Coruña,246047,29526,21898,"10,92%"
2,Valladolid,Valladolid,315522,20193,24176,"9,35%"
3,alicante,Alicante,33441,6153,17405,"14,13%"
4,Bilbao,Vizcaya,353187,27195,28618,"9,28%"
5,Zaragoza,Zaragoza,675121,36457,25150,"8,50%"
6,Sevilla,Sevilla,704198,54223,18223,"17,89%"
7,Barcelona,Barcelona,1619337,119831,26531,"9,18%"
8,Madrid,Madrid,3273049,209475,29576,1018
9,valencia,Valencia,2605764,279250,23647,"13,78%"


In [None]:
# Dataset dimensions as (number of rows, number of columns)
len(socio.index), len(socio.columns)

(15, 6)

In [None]:
# Count fully duplicated rows (the first occurrence of each row is not counted)
socio.duplicated(keep='first').sum()

0

In [None]:
# Number of missing values in each column
socio.isnull().sum()

City                 0
Province             0
Population           0
Young Population     0
GDB per capita       0
Unemployment rate    0
dtype: int64

In [None]:
# Inspect the raw column labels before normalising them
raw_column_labels = socio.columns
print(raw_column_labels)

Index(['City', 'Province', 'Population', 'Young Population', 'GDB per capita',
       'Unemployment rate'],
      dtype='object')


In [None]:
# Normalise the column labels to snake_case: trim surrounding whitespace, lower-case,
# and turn spaces and dots into underscores.
# regex=False pins both replacements to literal matching, so '.' can never be read as the
# regex wildcard, whatever the installed pandas version's default for `regex` is.
socio.columns = (
    socio.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('.', '_', regex=False)
)
print(socio.columns)

Index(['city', 'province', 'population', 'young_population', 'gdb_per_capita',
       'unemployment_rate'],
      dtype='object')


In [None]:
# Fix the typo in the source column label: "gdb" -> "gdp"
socio = socio.rename({'gdb_per_capita': 'gdp_per_capita'}, axis='columns')

In [None]:
# List the distinct city names as they appear in the raw data
socio.loc[:, 'city'].unique()


array(['Pamplona', 'A Coruña', 'Valladolid', 'alicante', 'Bilbao',
       'Zaragoza', 'Sevilla', 'Barcelona', ' Madrid ', 'valencia',
       'Palencia', 'Guadalajara', 'Caceres', ' Burgos', 'murcia'],
      dtype=object)

In [None]:
# City names carry stray leading/trailing whitespace and inconsistent casing:
# trim them first, then upper-case the first letter (the remaining letters are lower-cased)
city_trimmed = socio['city'].str.strip()
socio['city'] = city_trimmed.str.capitalize()

In [None]:
# Re-check the distinct city names after trimming and capitalising
socio.loc[:, 'city'].unique()

array(['Pamplona', 'A coruña', 'Valladolid', 'Alicante', 'Bilbao',
       'Zaragoza', 'Sevilla', 'Barcelona', 'Madrid', 'Valencia',
       'Palencia', 'Guadalajara', 'Caceres', 'Burgos', 'Murcia'],
      dtype=object)

In [None]:
# Align the city names: map each spelling produced by the capitalisation step
# back to its canonical form
city_mapping = {'A coruña': 'A Coruña'}

city_aligned = socio['city'].replace(to_replace=city_mapping)
socio['city'] = city_aligned

print(socio['city'].unique())

['Pamplona' 'A Coruña' 'Valladolid' 'Alicante' 'Bilbao' 'Zaragoza'
 'Sevilla' 'Barcelona' 'Madrid' 'Valencia' 'Palencia' 'Guadalajara'
 'Caceres' 'Burgos' 'Murcia']


## Unemployment Rate

In [None]:
# Convert the unemployment rate from a percentage string (e.g. "8,76%") to a fraction (0.0876).
#
# The raw column mixes strings with bare numbers (Madrid is stored as 1018). The `.str`
# accessor returns NaN for non-string cells, so every value is cast to str before parsing;
# otherwise such rows silently lose their rate.
raw_rate = socio['unemployment_rate']

# Only convert while the column is still in its raw form, so re-running this cell
# does not divide already-converted fractions by 100 a second time.
if not pd.api.types.is_float_dtype(raw_rate):
    unemployment_pct = pd.to_numeric(
        raw_rate.astype(str)
        .str.strip()
        .str.replace('%', '', regex=False)
        .str.replace(',', '.', regex=False),
        errors='coerce',
    )

    # NOTE(review): a percentage above 100 is impossible; presumably the decimal comma was
    # lost in the source (1018 -> 10,18%), so such values are rescaled by 100.
    # TODO confirm against the original data provider.
    unemployment_pct = unemployment_pct.where(unemployment_pct <= 100, unemployment_pct / 100)

    # Fail loudly instead of carrying silent NaNs into the cleaned dataset
    assert unemployment_pct.notna().all(), "unemployment_rate contains values that could not be parsed"

    socio['unemployment_rate'] = unemployment_pct / 100

socio.head()

Unnamed: 0,city,province,population,young_population,gdp_per_capita,unemployment_rate
0,Pamplona,Navarra,197488,20539,29134,0.0876
1,A Coruña,A Coruña,246047,29526,21898,0.1092
2,Valladolid,Valladolid,315522,20193,24176,0.0935
3,Alicante,Alicante,33441,6153,17405,0.1413
4,Bilbao,Vizcaya,353187,27195,28618,0.0928


In [None]:
# Preview the first five rows of the cleaned frame
socio.head(n=5)

Unnamed: 0,city,province,population,young_population,gdp_per_capita,unemployment_rate
0,Pamplona,Navarra,197488,20539,29134,0.0876
1,A Coruña,A Coruña,246047,29526,21898,0.1092
2,Valladolid,Valladolid,315522,20193,24176,0.0935
3,Alicante,Alicante,33441,6153,17405,0.1413
4,Bilbao,Vizcaya,353187,27195,28618,0.0928


### Save the cleaned dataset, with all of its columns, as a new version (socio_V2.csv)

In [None]:
# Write the cleaned frame to socio_V2.csv (relative path), leaving out the index
snapshot_name = "socio_V2.csv"
socio.to_csv(snapshot_name, index=False)

# Dropping all the columns that are not relevant

In [None]:
# Read the socio_V2.csv snapshot back into a new DataFrame and preview it
socio_v2 = pd.read_csv(filepath_or_buffer='socio_V2.csv')

socio_v2.head(n=5)

Unnamed: 0,city,province,population,young_population,gdp_per_capita,unemployment_rate
0,Pamplona,Navarra,197488,20539,29134,0.0876
1,A Coruña,A Coruña,246047,29526,21898,0.1092
2,Valladolid,Valladolid,315522,20193,24176,0.0935
3,Alicante,Alicante,33441,6153,17405,0.1413
4,Bilbao,Vizcaya,353187,27195,28618,0.0928


In [None]:
# Remove the columns that are not needed: province is dropped because every province
# only has 1 city
columns_to_drop = ['province']
socio_v2 = socio_v2.drop(columns=columns_to_drop)

socio_v2.head()

Unnamed: 0,city,population,young_population,gdp_per_capita,unemployment_rate
0,Pamplona,197488,20539,29134,0.0876
1,A Coruña,246047,29526,21898,0.1092
2,Valladolid,315522,20193,24176,0.0935
3,Alicante,33441,6153,17405,0.1413
4,Bilbao,353187,27195,28618,0.0928


# Save the cleaned dataset (without the province column)

In [None]:
# Folder on the shared drive that receives the cleaned datasets
output_dir = '/content/drive/Shared drives/Deloitte/Cleaned Data'

# Write the reduced frame as socio_cleaned.csv, leaving out the index
cleaned_csv_path = f"{output_dir}/socio_cleaned.csv"
socio_v2.to_csv(cleaned_csv_path, index=False)




# Dropping columns not needed for EDA