# 01 - Inspect and Clean Data

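This notebook loads the raw rental-price and crime datasets for FindMyBorough, inspects them, aggregates each one to a single row per London borough, and writes the cleaned per-borough feature files to `../data/clean/`.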

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def load_and_preview(file_path, sheet=None, header=0):
    """
    Load a dataset (.csv or .xls/.xlsx) and print a short preview of it.

    Parameters:
    - file_path (str or path-like): Path to the dataset file. The extension
      is matched case-insensitively.
    - sheet (int or str, optional): Sheet index or name for Excel files.
      Defaults to the first sheet. Ignored for CSV files.
    - header (int): Row index to use as column names for Excel/CSV.

    Returns:
    - pd.DataFrame

    Raises:
    - ValueError: If the file extension is not .csv, .xls or .xlsx.
    """
    print(f"Loading file: {file_path}")

    # Compare on a lowercased string so that 'DATA.CSV' and pathlib.Path
    # inputs are accepted as well as plain lowercase str paths.
    path_text = str(file_path).lower()

    if path_text.endswith(".csv"):
        df = pd.read_csv(file_path, header=header)

    elif path_text.endswith((".xls", ".xlsx")):
        # Context manager makes sure the workbook handle is closed again.
        with pd.ExcelFile(file_path) as xls:
            print("Available sheets:", xls.sheet_names)

            if sheet is None:
                sheet = 0  # Default to first sheet

            df = xls.parse(sheet, header=header)

    else:
        raise ValueError("Unsupported file type. Use .csv or .xls/.xlsx.")

    print("\nPreview:")
    display(df.head())
    print("\nInfo:")
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in display() (that rendered a stray "None" in the output).
    df.info()
    print("\nColumns:", df.columns.tolist())

    return df


In [3]:
def standardize_borough_names(series):
    """
    Normalize borough names so that different datasets can be joined on them.

    Steps applied to every non-missing value:
    - Convert to string
    - Replace '&' (with or without surrounding spaces) by ' and '
    - Collapse runs of whitespace into a single space and strip the ends
    - Lowercase
    - Replace known variants (only if needed)

    Missing values stay missing instead of becoming the literal strings
    'nan' / 'None'.

    Parameters:
    - series (pd.Series): Raw borough names.

    Returns:
    - pd.Series: Normalized names, same index as the input.
    """
    replacements = {
        'westminster city': 'westminster'
    }

    normalized = (
        series
        .astype(str)
        # Explicit regex=True: the default of str.replace differs between
        # pandas versions. Padding with spaces handles 'Kensington&Chelsea'.
        .str.replace(r'\s*&\s*', ' and ', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
        .str.lower()
        # Series.replace with a dict matches whole values only.
        .replace(replacements)
    )

    # astype(str) stringified the missing entries; restore them as missing.
    return normalized.where(series.notna())

In [4]:
# Normalized (lowercase, 'and' instead of '&') names of the areas kept by the
# cleaning steps below, grouped by initial letter.
valid_boroughs = [
    'barking and dagenham', 'barnet', 'bexley', 'brent', 'bromley',
    'camden', 'city of london', 'croydon',
    'ealing', 'enfield',
    'greenwich',
    'hackney', 'hammersmith and fulham', 'haringey', 'harrow', 'havering',
    'hillingdon', 'hounslow',
    'islington',
    'kensington and chelsea', 'kingston upon thames',
    'lambeth', 'lewisham',
    'merton',
    'newham',
    'redbridge', 'richmond upon thames',
    'southwark', 'sutton',
    'tower hamlets',
    'waltham forest', 'wandsworth', 'westminster',
]

def filter_valid_boroughs(df, borough_column='borough', valid_boroughs=None):
    """
    Filter a DataFrame to include only valid (normalized) London boroughs.

    The borough column is stripped and lowercased before matching, and the
    returned frame carries those normalized names. The input DataFrame is
    left untouched.

    Parameters:
    - df: pd.DataFrame — the dataset to filter
    - borough_column: str — name of the column containing borough names
    - valid_boroughs: list[str] — list of valid boroughs in lowercase

    Returns:
    - pd.DataFrame — new, filtered DataFrame (original row index preserved)

    Raises:
    - ValueError — if valid_boroughs is not provided
    """
    if valid_boroughs is None:
        raise ValueError("valid_boroughs list is required")

    # Normalize into a separate Series rather than assigning back into `df`,
    # so the caller's frame is not modified as a side effect.
    normalized = df[borough_column].str.strip().str.lower()
    keep = normalized.isin(valid_boroughs)

    filtered = df.loc[keep].copy()
    filtered[borough_column] = normalized.loc[keep]
    return filtered


## Rental Prices

In [5]:
# Read the "Raw data" sheet of the rent workbook; header=2 uses row index 2
# of that sheet as the column names.
df_rent = load_and_preview(
    file_path="../data/raw/voa-average-rent-borough.xls",
    sheet="Raw data",
    header=2,
)

Loading file: ../data/raw/voa-average-rent-borough.xls
Available sheets: ['Metadata', 'Summary', 'Pivot Table', 'Raw data']

Preview:


Unnamed: 0,Year,Quarter,Code,Area,Category,Count of rents,Average,Lower quartile,Median,Upper quartile
0,2011,Q2,E09000001,City of London,Room,-,-,-,-,-
1,2011,Q2,E09000002,Barking and Dagenham,Room,92,336,282,347,390
2,2011,Q2,E09000003,Barnet,Room,945,450,399,433,500
3,2011,Q2,E09000004,Bexley,Room,119,390,347,390,433
4,2011,Q2,E09000005,Brent,Room,344,469,390,457,550



Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6160 entries, 0 to 6159
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Year            6160 non-null   int64 
 1   Quarter         6160 non-null   object
 2   Code            6160 non-null   object
 3   Area            6160 non-null   object
 4   Category        6160 non-null   object
 5   Count of rents  6160 non-null   object
 6   Average         6160 non-null   object
 7   Lower quartile  6160 non-null   object
 8   Median          6160 non-null   object
 9   Upper quartile  6160 non-null   object
dtypes: int64(1), object(9)
memory usage: 481.4+ KB


None


Columns: ['Year', 'Quarter', 'Code', 'Area', 'Category', 'Count of rents', 'Average', 'Lower quartile', 'Median', 'Upper quartile']


In [6]:
# Filter rows where the property type is 'All categories'
df_all = df_rent[df_rent['Category'] == 'All categories']

# 'Median' is loaded as an object column because the sheet uses '-' as a
# placeholder for missing figures. Coerce it to numeric (placeholders become
# NaN) so the aggregation below works on real numbers and skips the gaps.
df_all = df_all.assign(Median=pd.to_numeric(df_all['Median'], errors='coerce'))

# Group by borough and calculate the median rent (median of the reported
# medians across every year/quarter present in the data)
df_rent_clean = df_all.groupby('Area')['Median'].median().reset_index()

# Rename columns to match the feature naming convention
# (the column is called 'avg_rent' although it holds a median)
df_rent_clean.columns = ['borough', 'avg_rent']

# Normalize borough names and filter only London ones
df_rent_clean['borough'] = standardize_borough_names(df_rent_clean['borough'])
df_rent_clean = filter_valid_boroughs(df_rent_clean, 'borough', valid_boroughs)
df_rent_clean.to_csv("../data/clean/clean_rent.csv", index=False)
df_rent_clean.head()

Unnamed: 0,borough,avg_rent
0,barking and dagenham,900.5
1,barnet,1264.0
2,bexley,850.0
3,brent,1314.0
4,bromley,1025.0


## Crime

In [7]:
# Read the crime dashboard CSV export with the default header row.
df_crime = load_and_preview(
    file_path="../data/raw/MPS Monthly Crime Dashboard_BoroughSNT_TNOCrimeDatafy22-23_03.csv",
)

Loading file: ../data/raw/MPS Monthly Crime Dashboard_BoroughSNT_TNOCrimeDatafy22-23_03.csv

Preview:


Unnamed: 0,Month_Year,Area Type,Borough_SNT,Area name,Area code,Offence Group,Offence Subgroup,Measure,Financial Year,FY_FYIndex,Count
0,2022-04-01,Borough,Aviation Security(SO18),Aviation Security(SO18),SO18,Arson and Criminal Damage,Criminal Damage,Offences,fy22-23,fy22-23_03,8
1,2022-04-01,Borough,Aviation Security(SO18),Aviation Security(SO18),SO18,Arson and Criminal Damage,Criminal Damage,Outcomes,fy22-23,fy22-23_03,1
2,2022-04-01,Borough,Aviation Security(SO18),Aviation Security(SO18),SO18,Burglary,Burglary Business and Community,Offences,fy22-23,fy22-23_03,1
3,2022-04-01,Borough,Aviation Security(SO18),Aviation Security(SO18),SO18,Burglary,Burglary Business and Community,Outcomes,fy22-23,fy22-23_03,1
4,2022-04-01,Borough,Aviation Security(SO18),Aviation Security(SO18),SO18,Drug Offences,Possession of Drugs,Offences,fy22-23,fy22-23_03,2



Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218229 entries, 0 to 218228
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Month_Year        218229 non-null  object
 1   Area Type         218229 non-null  object
 2   Borough_SNT       218229 non-null  object
 3   Area name         218229 non-null  object
 4   Area code         218229 non-null  object
 5   Offence Group     218229 non-null  object
 6   Offence Subgroup  218229 non-null  object
 7   Measure           218229 non-null  object
 8   Financial Year    218229 non-null  object
 9   FY_FYIndex        218229 non-null  object
 10  Count             218229 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 18.3+ MB


None


Columns: ['Month_Year', 'Area Type', 'Borough_SNT', 'Area name', 'Area code', 'Offence Group', 'Offence Subgroup', 'Measure', 'Financial Year', 'FY_FYIndex', 'Count']


In [8]:
# Restrict the data to rows whose 'Measure' is 'Offences'.
# NOTE(review): rows are not filtered on 'Area Type'; confirm that no
# non-borough area shares its 'Area name' with a borough.
df_crime_filtered = df_crime.loc[df_crime['Measure'].eq('Offences')]

# Sum 'Count' per area name, label the result with the feature column names,
# then normalize the names and keep only the valid London boroughs.
df_crime_clean = (
    df_crime_filtered
    .groupby('Area name')['Count']
    .sum()
    .rename('total_crimes')
    .rename_axis('borough')
    .reset_index()
    .assign(borough=lambda frame: standardize_borough_names(frame['borough']))
    .pipe(filter_valid_boroughs, 'borough', valid_boroughs)
)

# Persist the cleaned table and show its first rows
df_crime_clean.to_csv("../data/clean/clean_crime.csv", index=False)
df_crime_clean.head()

Unnamed: 0,borough,total_crimes
17,barking and dagenham,24903
23,barnet,31986
44,bexley,18776
60,brent,34400
72,bromley,26028


## Merge data

In [10]:
# Join the rent and crime features on the normalized borough name.
# The inner join keeps only boroughs that appear in BOTH cleaned datasets.
df_features = pd.merge(df_rent_clean, df_crime_clean, on='borough', how='inner')
df_features.to_csv("../data/clean/borough_features.csv", index=False)

# An inner join drops unmatched boroughs silently, and a dropped row can never
# show up in a null check. Compare against the expected list explicitly.
missing_boroughs = sorted(set(valid_boroughs) - set(df_features['borough']))
if missing_boroughs:
    print("Warning: expected boroughs absent from merged dataset:", missing_boroughs)

# Check for missing values
nulls = df_features[df_features.isnull().any(axis=1)]

if not nulls.empty:
    print("Warning: There are boroughs with missing values:")
    display(nulls)
else:
    print("No missing values found in merged dataset.")

df_features.head()

No missing values found in merged dataset.


Unnamed: 0,borough,avg_rent,total_crimes
0,barking and dagenham,900.5,24903
1,barnet,1264.0,31986
2,bexley,850.0,18776
3,brent,1314.0,34400
4,bromley,1025.0,26028
