We will work with the following 3 datasets:

- Unemployment (JSON);
- Income (CSV);
- Incidences (CSV);

## Libraries

In [32]:
import pandas as pd 
import numpy as np
import os
from pathlib import Path
import json

In [33]:
# Path to 'datasets' folder
datasets_path = Path.cwd() / "datasets"

# Get all subfolders inside 'datasets'
folders = sorted([f for f in datasets_path.iterdir() if f.is_dir()])

# Initialize a list to store the final dataframes
dfs = []

# Loop through the first 3 folders (or all if you want)
for folder in folders[:3]:
    print(f"\n Processing folder: {folder.name}")
    dataframes = []

    for file in folder.iterdir():
        if not file.is_file():
            continue

        try:
            if file.suffix == ".csv":
                df = pd.read_csv(file)
                print(f"  ✔ Read CSV: {file.name} | Shape: {df.shape}")
                dataframes.append(df)

            elif file.suffix == ".json":
                with open(file, 'r', encoding='utf-8') as f:
                    raw_json = json.load(f)

                if raw_json.get("success") is False:
                    print(f"  ⚠ Skipped invalid JSON (API error): {file.name}")
                    continue

                # Try to extract records from structured JSON (if available)
                if "result" in raw_json and "records" in raw_json["result"]:
                    df = pd.json_normalize(raw_json["result"]["records"])
                else:
                    # Assume it’s flat or list of dicts
                    df = pd.json_normalize(raw_json)

                print(f"  ✔ Read JSON: {file.name} | Shape: {df.shape}")
                dataframes.append(df)

        except Exception as e:
            print(f" Failed to load {file.name}: {e}")

    # Concatenate all files in this folder
    if dataframes:
        combined_df = pd.concat(dataframes, ignore_index=True)
        dfs.append(combined_df)
        print(f" Combined DataFrame for {folder.name}: {combined_df.shape}")
    else:
        dfs.append(None)
        print(f" No valid data files found in {folder.name}")

# Unpack into df1, df2, df3
df1, df2, df3 = dfs[:3]

# Show sample outputs
if df1 is not None:
    print("\n DF1 Sample:\n", df1.head())
if df2 is not None:
    print("\n DF2 Sample:\n", df2.head())
if df3 is not None:
    print("\n DF3 Sample:\n", df3.head())



 Processing folder: incidences
  ✔ Read CSV: 2018_Peticions_ciutadanes.csv | Shape: (100, 26)
  ✔ Read CSV: 2014_Peticions_ciutadanes.csv | Shape: (100, 26)
  ✔ Read CSV: 2015_Peticions_ciutadanes.csv | Shape: (100, 26)
  ✔ Read CSV: 2021_IRIS_Peticions_Ciutadanes_OpenData.csv | Shape: (100, 26)
  ✔ Read CSV: 2020_IRIS_Peticions_Ciutadanes_OpenData.csv | Shape: (100, 26)
  ✔ Read CSV: 2017_Peticions_ciutadanes.csv | Shape: (100, 26)
  ✔ Read CSV: 2022_IRIS_Peticions_Ciutadanes_OpenData.csv | Shape: (100, 26)
  ✔ Read CSV: 2019_Peticions_ciutadanes.csv | Shape: (100, 26)
  ✔ Read CSV: 2016_Peticions_ciutadanes.csv | Shape: (100, 26)
 Combined DataFrame for incidences: (900, 26)

 Processing folder: income
  ✔ Read CSV: 2015_Distribucio_territorial_renda_familiar.csv | Shape: (74, 7)
  ✔ Read CSV: 2008_Distribucio_territorial_renda_familiar.csv | Shape: (73, 7)
  ✔ Read CSV: 2013_Distribucio_territorial_renda_familiar.csv | Shape: (74, 7)
  ✔ Read CSV: 2010_Distribucio_territorial_renda

In [34]:
incident_df = df1.copy()
incident_df.head()

Unnamed: 0,CODI_DISTRICTE,DISTRICTE,ANY_DATA_TANCAMENT,DETALL,BARRI,FITXA_ID,LATITUD,AREA,COORDENADA_Y,COORDENADA_X,...,MES_DATA_TANCAMENT,TIPUS_VIA,ANY_DATA_ALTA,MES_DATA_ALTA,SUPORT,CARRER,CODI_BARRI,_id,TIPUS,DIA_DATA_TANCAMENT
0,5.0,Sarrià-Sant Gervasi,2018,Vehicles motor abandonats,Sant Gervasi - la Bonanova,573739,41.41143,Mobilitat,85004982.0,26931896.0,...,1,Carrer,2017,12,WEB,Bellesguard,25.0,1,INCIDENCIA,1
1,5.0,Sarrià-Sant Gervasi,2018,Vehicles motor abandonats,les Tres Torres,573738,41.39589,Mobilitat,83273451.0,27536884.0,...,1,Carrer,2017,12,TELÈFON,Doctor Roux,24.0,2,INCIDENCIA,1
2,2.0,Eixample,2018,Objectes a netejar / retirar,la Nova Esquerra de l'Eixample,573728,41.38189,Recollida i neteja de l'espai urbà,81702349.0,29214771.0,...,1,Carrer,2017,12,TELÈFON,Calàbria,9.0,3,INCIDENCIA,1
3,8.0,Nou Barris,2018,Recollida animals morts espai públic,la Guineueta,573730,41.43997,Recollida i neteja de l'espai urbà,88140422.0,30351204.0,...,1,Via,2017,12,TELÈFON,Favència,48.0,4,INCIDENCIA,1
4,6.0,Gràcia,2018,Objectes a netejar / retirar,la Vila de Gràcia,573729,41.39755,Recollida i neteja de l'espai urbà,83438243.0,29547477.0,...,1,Carrer,2017,12,TELÈFON,Doctor Rizal,31.0,5,INCIDENCIA,1


In [35]:
income_df = df2.copy()
income_df.head()

Unnamed: 0,Any,Codi_Districte,Nom_Districte,Codi_Barri,Nom_Barri,Població,Índex RFD Barcelona = 100
0,2015,1,Ciutat Vella,1,el Raval,47617,75.8
1,2015,1,Ciutat Vella,2,el Barri Gòtic,15269,108.5
2,2015,1,Ciutat Vella,3,la Barceloneta,15036,76.6
3,2015,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera",22305,96.4
4,2015,2,Eixample,5,el Fort Pienc,31645,104.8


In [36]:
unemployment_df = df3.copy()
unemployment_df.head()

Unnamed: 0,Codi_Districte,Sexe,Nom_Districte,Demanda_ocupació,Nom_Barri,Mes,Codi_Barri,Nombre,_id,Any,Demanda_ocupacio
0,1,Homes,Ciutat Vella,Atur registrat,el Raval,1,1,2107,1,2017,
1,1,Homes,Ciutat Vella,Atur registrat,el Barri Gòtic,1,2,538,2,2017,
2,1,Homes,Ciutat Vella,Atur registrat,la Barceloneta,1,3,537,3,2017,
3,1,Homes,Ciutat Vella,Atur registrat,"Sant Pere, Santa Caterina i la Ribera",1,4,741,4,2017,
4,2,Homes,Eixample,Atur registrat,el Fort Pienc,1,5,630,5,2017,


In [37]:

def quick_eda(df, name="DataFrame"):
    print(f"\n EDA Report for: {name}")
    print("-" * 50)
    
    # Shape
    print(f" Shape: {df.shape[0]} rows × {df.shape[1]} columns")
    
    # Column types
    print("\n Column Types:")
    print(df.dtypes)
    
    # Missing values
    print("\n Missing Values (%):")
    missing = df.isnull().mean() * 100
    print(missing[missing > 0].sort_values(ascending=False))

    # Unique values
    print("\n Unique Values (Top 10):")
    for col in df.columns[:10]:  # only first 10 columns to keep it readable
        print(f"{col}: {df[col].nunique()} unique values")

    # Summary stats for numeric columns
    print("\n Summary Statistics:")
    print(df.describe(include='number').T)

    # Sample values
    print("\n Sample Rows:")
    print(df.sample(min(5, len(df))))  # 5 random rows or less if df is small

    print("-" * 50)


In [38]:
quick_eda(incident_df, name="Incidences")
quick_eda(income_df, name="Income")
quick_eda(unemployment_df, name="Unemployment")



 EDA Report for: Incidences
--------------------------------------------------
 Shape: 900 rows × 26 columns

 Column Types:
CODI_DISTRICTE        float64
DISTRICTE              object
ANY_DATA_TANCAMENT      int64
DETALL                 object
BARRI                  object
FITXA_ID                int64
LATITUD               float64
AREA                   object
COORDENADA_Y          float64
COORDENADA_X          float64
LONGITUD              float64
DIA_DATA_ALTA           int64
CANALS_RESPOSTA        object
NUMERO                float64
ELEMENT                object
SECCIO_CENSAL         float64
MES_DATA_TANCAMENT      int64
TIPUS_VIA              object
ANY_DATA_ALTA           int64
MES_DATA_ALTA           int64
SUPORT                 object
CARRER                 object
CODI_BARRI            float64
_id                     int64
TIPUS                  object
DIA_DATA_TANCAMENT      int64
dtype: object

 Missing Values (%):
LATITUD           56.888889
LONGITUD          56.888889
TI

Dataset: Incident

| KPI                                               | Description                |
| ------------------------------------------------- | -------------------------- |
| Total number of incidences per district           | Measures complaint density |
| Most common type of incidence                     | Find frequent issues       |
| Month with more incidences                        | Normalize for comparison   |



Dataset: Income

| KPI                                                         | Description                      |
| ----------------------------------------------------------- | -------------------------------- |
| Average RFD (Relative Family Income Index) per district     | Measures average income level    |
| Districts with income above/below Barcelona's average (100) | Detect inequality                |
| Population-weighted income index                            | Accounts for district population |


Dataset: Unemployment

| KPI                                          | Description                                   |
| -------------------------------------------- | --------------------------------------------- |
| Unemployment rate by gender                  | Detect gender gap                             |
| Unemployment per district                    | Regional economic pressure                    |
| % change in unemployment from previous years | Trend analysis  |
