# Project plan:

1. Data Acquisition
2. Data Cleaning

# Data Acquisition

In [171]:
import pandas as pd # library for data analysis
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML documents

In [172]:
# URL of the Wikipedia page that is downloaded and parsed below.
wikiurl="https://en.wikipedia.org/wiki/List_of_elevation_extremes_by_country"

def checks_response_status(wikiurl):
    """
    Download a page over HTTP and print the status code of the reply.

    Parameters
    ----------
    wikiurl : str
        Address handed to ``requests.get``.

    Returns
    -------
    requests.Response
        The response object. It is returned whatever the status code is;
        the code is only printed, not validated.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection failure, timeout, ...).
    """
    # Without a timeout requests waits indefinitely on a stalled connection,
    # which would hang the whole notebook.
    response = requests.get(wikiurl, timeout=30)
    print(response.status_code)
    return response

# Download the page; the HTTP status code is printed as a side effect.
response = checks_response_status(wikiurl)

200


In [173]:
def scrape_data(response):
    """
    Extract the first ``wikitable`` of an HTML response as a DataFrame.

    Parameters
    ----------
    response : requests.Response
        Response whose ``text`` attribute holds the HTML document.

    Returns
    -------
    pandas.DataFrame
        The first table that ``pd.read_html`` parses out of the matched
        ``<table>`` element.

    Raises
    ------
    ValueError
        If the document contains no ``<table>`` with class ``wikitable``.
    """
    # Local import keeps this block self-contained.
    from io import StringIO

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'class': "wikitable"})
    if table is None:
        # str(None) would otherwise be handed to read_html as the text
        # "None", producing a confusing parser error.
        raise ValueError('no <table class="wikitable"> found in the response')

    # read_html expects a file-like object; passing literal HTML as a plain
    # string is deprecated in recent pandas releases.
    tables = pd.read_html(StringIO(str(table)))
    # read_html returns a list of DataFrames; only one table was passed in.
    return tables[0]

# Parse the downloaded HTML into the working DataFrame.
df = scrape_data(response)

In [174]:
# Preview the first ten rows of the raw table.
df.head(10)

Unnamed: 0,Country or region,Highest point,Maximum elevation,Lowest point,Minimum elevation,Elevation span
0,Afghanistan,Noshakh,"24,580 ft",Amu Darya,846 ft,"23,734 ft"
1,Albania,Korab,"9,068 ft",Adriatic Sea,,"9,068 ft"
2,Algeria,Mount Tahat,"9,852 ft",Chott Melrhir,−40 m−131 ft,"9,984 ft"
3,American Samoa,Lata Mountain on Ta‘ū,"3,169 ft",South Pacific Ocean,,"3,169 ft"
4,Andorra,Coma Pedrosa,"9,652 ft",Gran Valira,"2,756 ft","6,896 ft"
5,Angola,Mount Moco,"8,596 ft",South Atlantic Ocean,,"8,596 ft"
6,Anguilla,Crocus Hill,213 ft,Caribbean Sea,,213 ft
7,Antarctica,Mount Vinson,"16,050 ft","Deep Lake, Vestfold Hills[1][citation needed]",−50 m−164 ft,"16,214 ft"
8,Antigua and Barbuda,Boggy Peak on Antigua,"1,319 ft",Caribbean Sea,,"1,319 ft"
9,Argentina,Aconcagua[2][3],"22,835 ft",Laguna del Carbón[4],−105 m−344 ft,"23,179 ft"


# Data Cleaning

Cleaning Plan:

0. Add underscore to columns' names.
1. Convert Maximum elevation values to int, convert ft to meters.
2. Convert Minimum elevation values to int; if the lowest point is at sea level, replace it with 0; convert ft to meters.
3. Check if elevation span is correct by creating a bool column (True/False).
4. Create a bool column that is True for countries with the elevation more than 6000 m.
5. Get the 5 countries with largest elevation span difference.

In [175]:
# Swap spaces for underscores in every column label.
df.columns = [label.replace(' ', '_') for label in df.columns]

In [176]:
def convert_elevation(df, col):
    """
    Replace a text column by the first run of digits found in each cell.

    Commas are removed first, then the first ``\\d+`` match is taken and
    converted with ``pd.to_numeric``; cells without digits become NaN.
    The column is overwritten in ``df``, which is also returned.
    """
    without_commas = df[col].str.replace(',', '')
    # expand=False yields a Series instead of a one-column DataFrame.
    first_number = without_commas.str.extract(r'(\d+)', expand=False)
    df[col] = pd.to_numeric(first_number, errors='coerce')
    return df

# Turn the two text columns into numbers; the ft -> m conversion is applied later.
df = convert_elevation(df, 'Maximum_elevation')
df = convert_elevation(df, 'Elevation_span')

In [177]:
def check_elevation_span(x):
    """
    Convert a value to ``int``, mapping missing values to zero.

    Parameters
    ----------
    x : object
        ``None``, a float NaN, or anything accepted by ``int()``.

    Returns
    -------
    int
        ``0`` for ``None`` or NaN, otherwise ``int(x)``.

    Raises
    ------
    ValueError, TypeError
        Propagated from ``int(x)`` for values it cannot convert.
    """
    # Identity test: `== None` would defer to a custom __eq__.
    if x is None:
        return 0
    # pandas represents missing numbers as NaN, which int() rejects;
    # NaN is the only float that is not equal to itself.
    if isinstance(x, float) and x != x:
        return 0
    return int(x)

In [178]:
def min_elevations(df):
    """
    Reduce the ``Minimum_elevation`` column to its value in feet, as text.

    Commas are stripped, then the number directly followed by whitespace
    and ``ft`` is captured, including a leading Unicode minus sign
    (U+2212) if present. Cells without such a match become ``'0'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``Minimum_elevation``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the column overwritten by strings
        such as ``'846'``, ``'−131'`` or ``'0'``.
    """
    col = 'Minimum_elevation'
    without_commas = df[col].str.replace(',', '')
    # expand=False yields a Series instead of a one-column DataFrame.
    feet = without_commas.str.extract(r'(−?\d+)\sft', expand=False)
    # Fill with the string '0' rather than the int 0 so the column never
    # mixes types (strict string dtypes refuse non-string fill values).
    df[col] = feet.fillna('0').astype(str)
    return df

# Reduce Minimum_elevation to its feet value (still stored as text).
df = min_elevations(df)

In [179]:
def check_negative_values(x):
    """
    Parse a numeric string into an int, honouring the Unicode minus sign.

    A string containing '−' (U+2212) is parsed without that character and
    the result is negated; any other string is passed straight to ``int``.
    """
    if '−' not in x:
        return int(x)
    magnitude = x.replace('−', '')
    return -int(magnitude)
    
# Convert the cleaned strings to signed integers.
df['Minimum_elevation'] = df['Minimum_elevation'].apply(check_negative_values)
# Recompute the span from the two extremes so it can be compared with the published one.
df['check_elevation_span'] = df['Maximum_elevation'] - df['Minimum_elevation']

In [180]:
def compare_elevations(df):
    """
    Flag rows whose ``Elevation_span`` equals ``check_elevation_span``.

    The result is stored in a new boolean column ``compare_elevations``;
    the helper column ``check_elevation_span`` is then removed from ``df``
    in place. The same ``df`` object is returned.
    """
    spans_match = df['Elevation_span'] == df['check_elevation_span']
    df['compare_elevations'] = spans_match
    df.drop(columns='check_elevation_span', inplace=True)
    return df

# Add the compare_elevations flag and drop the helper column.
df = compare_elevations(df)

In [181]:
def ft_to_meters(df, columns=None):
    """
    Convert numeric columns from feet to meters (1 ft = 0.3048 m).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to convert; modified in place.
    columns : iterable of str, optional
        Columns to convert. Defaults to ``Maximum_elevation``,
        ``Minimum_elevation`` and ``Elevation_span``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the selected columns multiplied by 0.3048.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if columns is None:
        columns = ['Maximum_elevation', 'Minimum_elevation', 'Elevation_span']
    for col in columns:
        # Vectorised multiplication gives the same values as a per-element
        # apply, without a Python-level call for every row.
        df[col] = df[col] * 0.3048
    return df

# Convert the three elevation columns from feet to meters.
df = ft_to_meters(df)

In [182]:
# Flag rows whose highest point exceeds 6000 (the column is in meters after the conversion above).
is_above_6000 = df['Maximum_elevation'] > 6000
df['6000_more'] = is_above_6000
# The name of the highest point is not needed any more.
df.drop(columns='Highest_point', inplace=True)
df.head()

Unnamed: 0,Country_or_region,Maximum_elevation,Lowest_point,Minimum_elevation,Elevation_span,compare_elevations,6000_more
0,Afghanistan,7491.984,Amu Darya,257.8608,7234.1232,True,True
1,Albania,2763.9264,Adriatic Sea,0.0,2763.9264,True,False
2,Algeria,3002.8896,Chott Melrhir,-39.9288,3043.1232,False,False
3,American Samoa,965.9112,South Pacific Ocean,0.0,965.9112,True,False
4,Andorra,2941.9296,Gran Valira,840.0288,2101.9008,True,False
