# Pipelines Accidents

Project Plan:

1. Data Aquisition
2. Data Cleaning

In [251]:
# import necessary libraries
import requests
from bs4 import BeautifulSoup as bs
from time import *
import re
from random import randint

In [252]:
# specify the parameters of the browser settings
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}

# create a url list to scrape data from all pages
url = 'https://en.wikipedia.org/wiki/List_of_pipeline_accidents_in_the_United_States'

def get_page(url):
    """
    function to get the page status of the bs object and bs soup object
    """
    page = requests.get(url, headers = header)
    status = page.status_code
    soup = bs(page.text, 'lxml')
    return [soup, status]

soup = get_page(url)[0]

In [253]:
def get_links(soup):
    """
    collect the links fron the individual webpages of wiki
    """
    list_links = []

    for link in soup.findAll('li'):
        if len(link.a['href']) == 61:
            if link.a['href'][-4:] != '2021':
                print(link.a['href'])
                list_links.append(link.a['href'])
            else:
                break
    return list_links

list_links = get_links(soup)

/wiki/List_of_pipeline_accidents_in_the_United_States_in_1970
/wiki/List_of_pipeline_accidents_in_the_United_States_in_1971
/wiki/List_of_pipeline_accidents_in_the_United_States_in_1972
/wiki/List_of_pipeline_accidents_in_the_United_States_in_1973
/wiki/List_of_pipeline_accidents_in_the_United_States_in_1974
/wiki/List_of_pipeline_accidents_in_the_United_States_in_1975
/wiki/List_of_pipeline_accidents_in_the_United_States_in_1976
/wiki/List_of_pipeline_accidents_in_the_United_States_in_1977
/wiki/List_of_pipeline_accidents_in_the_United_States_in_1978
/wiki/List_of_pipeline_accidents_in_the_United_States_in_1979
/wiki/List_of_pipeline_accidents_in_the_United_States_in_1980
/wiki/List_of_pipeline_accidents_in_the_United_States_in_1981
/wiki/List_of_pipeline_accidents_in_the_United_States_in_1982
/wiki/List_of_pipeline_accidents_in_the_United_States_in_1983
/wiki/List_of_pipeline_accidents_in_the_United_States_in_1984
/wiki/List_of_pipeline_accidents_in_the_United_States_in_1985
/wiki/Li

In [254]:
def get_full_url(list_links):
    """
    append base site to individual links
    """
    ready_links = []
    base_url = 'https://en.wikipedia.org'
    for link in list_links:

        full_url = base_url + link

        ready_links.append(full_url)
    return ready_links

ready_links = get_full_url(list_links)

In [255]:
def extract_data(ready_links):
    """
    iterate across links, with delay and get dict:
    name and info columns, create dataframe from this dict
    """
    get_info = []

    i = 0
    for link in ready_links:

        x = randint(9,15)
        print(f'Scraping page {i}')
        print(f'I waited {x} seconds')
        sleep(x)
        
        res = requests.get(link).text
        soup = bs(res, 'lxml')
        info = soup.find('div',{'class':'vector-body'}).get_text().strip()
        name = soup.find('h1',{'class':'firstHeading'}).get_text()

        dic = {
            'info': info,
            'name': name}

        get_info.append(dic)
        i+=1
    df = pd.DataFrame(get_info) 
    df.to_csv('pipelines.csv')
    return df

df = extract_data(ready_links)

Scraping page 0
I waited 15 seconds
Scraping page 1
I waited 15 seconds
Scraping page 2
I waited 11 seconds
Scraping page 3
I waited 13 seconds
Scraping page 4
I waited 9 seconds
Scraping page 5
I waited 12 seconds
Scraping page 6
I waited 14 seconds
Scraping page 7
I waited 11 seconds
Scraping page 8
I waited 10 seconds
Scraping page 9
I waited 14 seconds
Scraping page 10
I waited 15 seconds
Scraping page 11
I waited 13 seconds
Scraping page 12
I waited 10 seconds
Scraping page 13
I waited 9 seconds
Scraping page 14
I waited 14 seconds
Scraping page 15
I waited 13 seconds
Scraping page 16
I waited 13 seconds
Scraping page 17
I waited 9 seconds
Scraping page 18
I waited 9 seconds
Scraping page 19
I waited 13 seconds
Scraping page 20
I waited 15 seconds
Scraping page 21
I waited 13 seconds
Scraping page 22
I waited 13 seconds
Scraping page 23
I waited 12 seconds
Scraping page 24
I waited 13 seconds
Scraping page 25
I waited 15 seconds
Scraping page 26
I waited 13 seconds
Scraping page 2

<IPython.core.display.Javascript object>

# Data Cleaning



Here is the marterial to work with

In [257]:
df['info'][0]

'From Wikipedia, the free encyclopedia\n\n\n\nJump to navigation\nJump to search\nWikipedia list article\n\nThe following is a list of pipeline accidents in the United States in 1970. It is one of several lists of U.S. pipeline accidents. See also: list of natural gas and oil production accidents in the United States.\n\nThis list is incomplete; you can help by adding missing items.  (November 2020)\n\n\nIncidents[edit]\nThis is not a complete list of all pipeline accidents. For natural gas alone, the Pipeline and Hazardous Materials Safety Administration (PHMSA), a United States Department of Transportation agency, has collected data on more than 3,200 accidents deemed serious or significant since 1987.\nA "significant incident" results in any of the following consequences:\n\nFatality or injury requiring in-patient hospitalization.\n$50,000 or more in total costs, measured in 1984 dollars.\nLiquid releases of five or more barrels (42 US gal/barrel).\nReleases resulting in an unintent

Plan for data cleaning:

1. From column Name get only years.
2. Get the number of accidents from info column.
3. Get states in which accident happend.
4. Calculate the property damage in a year.
5. Compute the number of people killed

From column Name get only years

In [258]:
def get_a_year(df):
    """
    extract year info from a column
    """
    df['Year'] = df['name'].str.extract(r'(\d+)')
    return df

df =get_a_year(df) 

Get the number of accidents from info column.

In [259]:
def find_dates(x):
    """
    find unique dates at which incidents occured, search for month and any two digits,
    calculate the number of unique dates
    """
    months = ['January','February','March','April','May','June',
            'July','August','September','October','November','December']
    
    res = re.findall(r"(January\s\d{2})|(February\s\d{2})|(March\s\d{2})|\
                 (April\s\d{2})|(May\s\d{2})|(June\s\d{2})\
                |(July\s\d{2})|(August\s\d{2})|(September\s\d{2})|\
                (October\s\d{2})|(November\s\d{2})|(December\s\d{2})", x)
    dates = []
    for tupl in res:
        for word in tupl:
            if len(word) != 0:
                dates.append(word)
    return len(set(dates))

df['number_incidents'] = df['info'].apply(lambda x: find_dates(x))

get locations of states

In [260]:
def state_search(x):
    
    states = ["Alaska","Alabama","Arkansas","American Samoa","Arizona","California",
          "Colorado","Connecticut","District of Columbia","Delaware","Florida",
          "Georgia","Guam", "Hawaii","Iowa","Idaho","Illinois","Indiana","Kansas",
          "Kentucky","Louisiana","Massachusetts","Maryland","Maine","Michigan",
          "Minnesota","Missouri","Mississippi","Montana","North Carolina","North Dakota",
          "Nebraska","New Hampshire","New Jersey","New Mexico","Nevada","New York","Ohio",
          "Oklahoma","Oregon","Pennsylvania","Puerto Rico","Rhode Island","South Carolina",
          "South Dakota","Tennessee","Texas","Utah","Virginia","Virgin Islands","Vermont",
          "Washington","Wisconsin","West Virginia","Wyoming"]

    state_found = []
    for state in states:
        if state in x:
            state_found.append(state)
    return ','.join(state_found)

df['states_affected'] = df['info'].apply(lambda x: state_search(x))

In [261]:
# property damage
df['info'] = df['info'].str.replace(',','')

Calculate the property damage in a year.

In [262]:
def find_damage(x):
    """
    extract property damage string
    """
    r1 = re.findall(r"(\$\d+\s[a-zA-Z]+?)", x)
    return ','.join(r1)

df['damage'] = df['info'].apply(lambda x: find_damage(x))
df['damage'] = df['damage'].str.lower()

In [263]:
def find_millions(x):
    """
    classify the info: if damage contains millions,
    add 000000, then sum up values
    """
    new_string = ''
    res = ''
    
    splitted = x.split(',')
    for ele in splitted:
        if 'm' in ele:
            ele = ele.replace('m', '000000')
            new_string += ele
        else:
            res += ele
    new_string = res + new_string
    return new_string

df['damage_numbers'] = df['damage'].apply(lambda x: find_millions(x))
df['damage_numbers'] = df['damage_numbers'].str.replace(' ', '').str.replace(r'([a-z]+)','')

In [264]:
def join(x):
    """
    get rid of list datatype
    """
    return ''.join(x)

df['damage_numbers'] = df['damage_numbers'].apply(lambda x: join(x))
df['damage_numbers'] = df['damage_numbers'].str.replace(r'\xa0', '')
df['damage_numbers'] = df['damage_numbers'].str.replace('$', ' ')

In [265]:
def sum_values(x):
    """
    sum up values
    """
    res = x.split()
    summ = 0
    for val in res:
        summ += int(val)
    return summ

df['damage_numbers'] = df['damage_numbers'].apply(lambda x: sum_values(x))

Compute the number of people killed

In [266]:
def fatalities(x):
    """
    find people got killed (string)
    """
    r1 = re.findall(r"killed\s?\d+", x)
    return r1

df['killed'] =  df['info'].apply(lambda x: fatalities(x))

In [267]:
def numbers_killed(x):
    """
    compute the number of killed
    """
    if len(x) == 0:
        return 0
    elif len(x) == 1:
        return int(''.join(x).split()[1])
    else:
        one  = x[0].split( )[1]
        two = x[1].split( )[1]
        return int(one) + int(two)
    
df['numbers_killed'] = df['killed'].apply(lambda x: numbers_killed(x))

In [269]:
def dropping_columns(df):
    """
    drop unnecessary columns
    """
    df.drop(columns = ['info', 'name', 'damage', 'killed'], axis = 1, inplace = True)
    return df

df = dropping_columns(df)

# DataFrame cleaned

In [270]:
df.head(15)

Unnamed: 0,Year,number_incidents,states_affected,damage_numbers,numbers_killed
0,1970,13,"Florida,Kentucky,Louisiana,Maryland,Missouri,M...",50000,20
1,1971,2,"Arkansas,Mississippi,New Jersey,New York,Penns...",50000,0
2,1972,12,"Alabama,Georgia,Iowa,Indiana,Kansas,Louisiana,...",50000,0
3,1973,10,"Alabama,California,Iowa,Idaho,Kansas,Louisiana...",50000,0
4,1974,9,"Illinois,Indiana,Kansas,Kentucky,Louisiana,Min...",50000,0
5,1975,12,"Alaska,Iowa,Louisiana,Michigan,Missouri,North ...",50000,0
6,1976,18,"Alabama,Arizona,California,Illinois,Louisiana,...",50000,16
7,1977,17,"Alaska,Alabama,Arizona,California,Georgia,Iowa...",2050000,3
8,1978,14,"Alaska,Iowa,Indiana,Kansas,Louisiana,Massachus...",50500,0
9,1979,13,"Alaska,Florida,Iowa,Indiana,Kentucky,Louisiana...",3050000,7
