In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# Page that holds Numbeo's current crime ranking table.
url = "https://www.numbeo.com/crime/rankings_current.jsp"
# Request headers sent with the download; only a browser-like User-Agent is set.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

In [4]:
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Stop on 4xx/5xx responses instead of parsing an error page as if it were data.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

In [16]:
# The ranking data lives in the <table> element whose id is "t2".
table = soup.find('table', id='t2')
table

<table class="stripe row-border order-column compact" id="t2">
<thead>
<tr>
<th><div style="font-size: 80%; vertical-align: middle;">Rank</div></th>
<th><div style="font-size: 95%;">City</div></th>
<th><div style="font-size: 90%;">Crime Index</div></th>
<th><div style="font-size: 90%;">Safety Index</div></th>
</tr>
</thead>
<tbody>
<tr style="width: 100%">
<td></td>
<td class="cityOrCountryInIndicesTable"><a class="discreet_link" href="https://www.numbeo.com/crime/in/Pietermaritzburg">Pietermaritzburg, South Africa</a></td>
<td style="text-align: right">82.8</td>
<td style="text-align: right">17.2</td>
</tr>
<tr style="width: 100%">
<td></td>
<td class="cityOrCountryInIndicesTable"><a class="discreet_link" href="https://www.numbeo.com/crime/in/Pretoria">Pretoria, South Africa</a></td>
<td style="text-align: right">81.9</td>
<td style="text-align: right">18.1</td>
</tr>
<tr style="width: 100%">
<td></td>
<td class="cityOrCountryInIndicesTable"><a class="discreet_link" href="https://www.

In [18]:
# Column titles are the stripped text of every <th> cell in the table.
# NOTE: this rebinds `headers`, which until now held the HTTP request headers;
# re-running the request cell after this one would send this list instead.
header_cells = table.find_all('th')
headers = [cell.text.strip() for cell in header_cells]
headers

['Rank', 'City', 'Crime Index', 'Safety Index']

In [21]:
# Extract rows
body_rows = table.find_all('tr')[1:]  # the first <tr> is the header row
parsed_rows = ([td.text.strip() for td in tr.find_all('td')] for tr in body_rows)
rows = [cells for cells in parsed_rows if cells]  # keep only rows that contain <td> cells

rows

[['', 'Pietermaritzburg, South Africa', '82.8', '17.2'],
 ['', 'Pretoria, South Africa', '81.9', '18.1'],
 ['', 'Caracas, Venezuela', '81.4', '18.6'],
 ['', 'Port Moresby, Papua New Guinea', '81.3', '18.7'],
 ['', 'Johannesburg, South Africa', '80.9', '19.1'],
 ['', 'Durban, South Africa', '80.4', '19.6'],
 ['', 'San Pedro Sula, Honduras', '79.4', '20.6'],
 ['', 'Memphis, TN, United States', '78.6', '21.4'],
 ['', 'Port Elizabeth, South Africa', '78.5', '21.5'],
 ['', 'Salvador, Brazil', '76.5', '23.5'],
 ['', 'Port of Spain, Trinidad And Tobago', '76.4', '23.6'],
 ['', 'Fortaleza, Brazil', '75.9', '24.1'],
 ['', 'Rio de Janeiro, Brazil', '75.2', '24.8'],
 ['', 'Recife, Brazil', '74.8', '25.2'],
 ['', 'Guayaquil, Ecuador', '74.4', '25.6'],
 ['', 'Cape Town, South Africa', '73.7', '26.3'],
 ['', 'Detroit, MI, United States', '72.8', '27.2'],
 ['', 'Baltimore, MD, United States', '72.1', '27.9'],
 ['', 'Cali, Colombia', '72.0', '28.0'],
 ['', 'Tijuana, Mexico', '71.5', '28.5'],
 ['', 'Al

In [23]:
# Build the DataFrame: one record per scraped row, columns named after the table headers
df = pd.DataFrame(data=rows, columns=headers)
df.head(n=5)

Unnamed: 0,Rank,City,Crime Index,Safety Index
0,,"Pietermaritzburg, South Africa",82.8,17.2
1,,"Pretoria, South Africa",81.9,18.1
2,,"Caracas, Venezuela",81.4,18.6
3,,"Port Moresby, Papua New Guinea",81.3,18.7
4,,"Johannesburg, South Africa",80.9,19.1


# Preprocessing scraped data
## Adding country and continent

In [24]:
# Extracting Country: the country is whatever follows the last comma of the "City" value
df['Country'] = [city.rpartition(',')[2].strip() for city in df['City']]

In [None]:
# Determining continent
!pip install pycountry_convert
import pycountry_convert as pc

def country_to_continent(country_name):
    """
    Map a country name to the name of its continent using pycountry_convert.

    Parameters:
    country_name (str): Country name to look up.

    Returns:
    str: The continent name, or 'Unknown' when any step of the lookup fails.
    """
    try:
        country_code = pc.country_name_to_country_alpha2(country_name)
        continent_code = pc.country_alpha2_to_continent_code(country_code)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except Exception:
        # Best-effort lookup: any failure in the conversion chain maps to 'Unknown'.
        # Unlike a bare `except:`, this lets KeyboardInterrupt and SystemExit
        # propagate, so the cell can still be interrupted.
        return 'Unknown'

# Look up the continent of every row from its country name.
df['Continent'] = df['Country'].map(country_to_continent)


Collecting pycountry_convert
  Downloading pycountry_convert-0.7.2-py3-none-any.whl.metadata (7.2 kB)
Collecting pprintpp>=0.3.0 (from pycountry_convert)
  Downloading pprintpp-0.4.0-py2.py3-none-any.whl.metadata (7.9 kB)
Collecting pycountry>=16.11.27.1 (from pycountry_convert)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Collecting pytest>=3.4.0 (from pycountry_convert)
  Downloading pytest-9.0.2-py3-none-any.whl.metadata (7.6 kB)
Collecting pytest-mock>=1.6.3 (from pycountry_convert)
  Downloading pytest_mock-3.15.1-py3-none-any.whl.metadata (3.9 kB)
Collecting pytest-cov>=2.5.1 (from pycountry_convert)
  Downloading pytest_cov-7.0.0-py3-none-any.whl.metadata (31 kB)
Collecting repoze.lru>=0.7 (from pycountry_convert)
  Downloading repoze.lru-0.7-py3-none-any.whl.metadata (1.1 kB)
Collecting iniconfig>=1.0.1 (from pytest>=3.4.0->pycountry_convert)
  Downloading iniconfig-2.3.0-py3-none-any.whl.metadata (2.5 kB)
Collecting pluggy<2,>=1.5 (from pytest>=3.4.0->pycou

In [26]:
# Preview the first five rows, now including the Country and Continent columns.
df.head(n=5)

Unnamed: 0,Rank,City,Crime Index,Safety Index,Country,Continent
0,,"Pietermaritzburg, South Africa",82.8,17.2,South Africa,Africa
1,,"Pretoria, South Africa",81.9,18.1,South Africa,Africa
2,,"Caracas, Venezuela",81.4,18.6,Venezuela,South America
3,,"Port Moresby, Papua New Guinea",81.3,18.7,Papua New Guinea,Oceania
4,,"Johannesburg, South Africa",80.9,19.1,South Africa,Africa


In [28]:
def _fetch_crime_rankings(timeout=30):
    """Download Numbeo's current crime ranking table and return it as a DataFrame of strings."""
    url = "https://www.numbeo.com/crime/rankings_current.jsp"
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }

    # The timeout prevents an indefinite hang on a stalled connection.
    response = requests.get(url, headers=request_headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find('table', {'id': 't2'})
    if table is None:
        raise ValueError(f"Could not find the rankings table (id='t2') at {url}")

    column_names = [th.text.strip() for th in table.find_all('th')]

    rows = []
    for tr in table.find_all('tr')[1:]:  # skip header row
        cells = [td.text.strip() for td in tr.find_all('td')]
        if cells:  # rows without <td> cells carry no data
            rows.append(cells)

    return pd.DataFrame(rows, columns=column_names)


def crime_index_countries(continent = None, num_countries = 10, ascending = False):
    """
    Scrape Numbeo's current crime ranking and return cities sorted by Crime Index.

    Each row of the scraped table is a city. The country is taken from the text
    after the last comma of the 'City' value, and the continent is derived from
    that country with `country_to_continent` (defined earlier in this notebook).

    Parameters:
    continent (str): The continent to filter by (e.g., 'Europe', 'Asia'). If None
        (or any other falsy value), no filtering is applied.
    num_countries (int): The maximum number of rows (cities) to return.
    ascending (bool): Whether to sort the Crime Index in ascending order.

    Returns:
    pd.DataFrame: A DataFrame with columns 'City' and 'Crime Index', sorted by
        'Crime Index' and limited to `num_countries` rows.

    Raises:
    requests.HTTPError: If the page responds with an HTTP error status.
    ValueError: If the rankings table cannot be found in the page.
    """

    # === Scraping from the web page ================================================
    df = _fetch_crime_rankings()

    # === Deriving country and continent ============================================
    df['Country'] = df['City'].apply(lambda x: x.split(',')[-1].strip())
    df['Continent'] = df['Country'].apply(country_to_continent)

    # Convert before filtering so the assignment targets the full frame rather
    # than a filtered slice (which triggers pandas' SettingWithCopyWarning).
    # Values that cannot be parsed become NaN.
    df['Crime Index'] = pd.to_numeric(df['Crime Index'], errors='coerce')

    # === Filtering and sorting =====================================================
    if continent:
        df = df[df['Continent'] == continent]
    df = df.sort_values(by='Crime Index', ascending=ascending).head(num_countries)

    return df[['City', 'Crime Index']]

In [29]:
## Example usage:
result_df = crime_index_countries(continent='Europe', num_countries=5, ascending=True)
# A bare last expression gets the notebook's rich table display instead of plain print() text.
result_df

                                  City  Crime Index
394                   Craiova, Romania         17.4
392  The Hague (Den Haag), Netherlands         19.9
391             Eindhoven, Netherlands         20.5
390                   Tampere, Finland         20.6
389                    Munich, Germany         21.0
