# UFO Data Scraping
### source: www.nuforc.org

This script scrapes UFO observation data from the NUFORC website. It was executed on a Microsoft Azure cloud server and takes approximately 10 hours to complete.

In [166]:
import requests as req
import pandas as pd
from bs4 import BeautifulSoup

First, we define a class that parses the HTML table on each web page into a pandas DataFrame.

In [16]:
class HTMLTableParser:
    """Convert a parsed HTML ``<table>`` element into a pandas DataFrame."""

    def parse_html_table(self, table):
        """Build a DataFrame from the rows of an HTML table.

        Parameters
        ----------
        table
            A parsed table element. It only needs to expose
            ``find_all('tr')``; each row must expose ``find_all('td')`` and
            ``find_all('th')``, and each cell ``get_text()``.

        Returns
        -------
        pandas.DataFrame
            One row per ``<tr>`` that contains at least one ``<td>``. The
            columns are named after the first row that contains ``<th>``
            cells, or numbered ``0..n-1`` when no header row exists. Columns
            whose values can all be cast to ``float`` are converted; the
            others keep the cell text. A table with a header but no data
            rows yields an empty frame carrying the header names.

        Raises
        ------
        ValueError
            If header cells were found and their count differs from the
            number of cells in the first data row.
        """
        column_names = []
        body_rows = []

        # Single pass: collect the text of every data row and remember the
        # first header row we come across.
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if cells:
                body_rows.append([cell.get_text() for cell in cells])

            if not column_names:
                column_names = [th.get_text() for th in row.find_all('th')]

        n_rows = len(body_rows)
        # The first data row fixes the table width. Without any data row,
        # fall back to the header width so a header-only table is accepted
        # instead of being reported as a title/column mismatch.
        n_columns = len(body_rows[0]) if body_rows else len(column_names)

        # Safeguard on column titles
        if column_names and len(column_names) != n_columns:
            raise ValueError("Column titles do not match the number of columns")

        columns = column_names if column_names else range(n_columns)
        df = pd.DataFrame(columns=columns, index=range(n_rows))

        # Fill cell by cell so rows shorter than the first one simply keep
        # the frame's initial missing values in their trailing columns.
        # NOTE(review): a row wider than the first data row is not handled
        # here and will fail inside the positional assignment.
        for row_idx, values in enumerate(body_rows):
            for col_idx, text in enumerate(values):
                df.iat[row_idx, col_idx] = text

        # Convert to float if possible; non-numeric columns stay as text.
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass

        return df

With the HTMLTableParser class defined, we go through all of the possible web pages (one per month). Within each monthly page, the script follows the click-through links to fetch the detailed description of each observation.

In [None]:

# Crawl every monthly index page of the NUFORC web reports, parse its summary
# table, follow each row's link to pull the full sighting description, and
# write the combined result to a single CSV file.
#
# NOTE(review): `tq` is not defined or imported anywhere in the visible file.
# It is presumably tqdm's progress-bar wrapper (`from tqdm import tqdm as tq`);
# confirm and import it before running this cell.
baseURL = "http://www.nuforc.org/webreports/"
monthrange = ['01','02','03','04','05','06','07','08','09','10','11','12']
yearrange = range(1338,2019)

# Seconds to wait on any single HTTP request, so that one stalled connection
# cannot hang the whole multi-hour run indefinitely.
REQUEST_TIMEOUT = 60

# The parser keeps no state between calls, so one instance serves every page.
parser = HTMLTableParser()

# Collect the per-month frames and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every call.
monthly_tables = []
for year in tq(yearrange):
    for month in monthrange:
        url = baseURL + "ndxe" + str(year) + month + ".html"
        r = req.get(url, timeout=REQUEST_TIMEOUT)
        if r.status_code == 404:
            # No index page exists for this year/month.
            continue
        # Any other HTTP error is a real failure; stop with a clear error
        # instead of parsing an error page as if it were data.
        r.raise_for_status()

        soup = BeautifulSoup(r.text, 'html.parser')
        html_table = soup.find_all('table')[0]
        table = parser.parse_html_table(html_table)
        table['year'] = year
        table['month'] = month

        # One description per link in the index table.
        # NOTE(review): the assignment to table['Desc'] below needs exactly
        # one link per parsed row - verify against the page layout.
        descriptions = []
        for link in html_table.find_all('a'):
            try:
                detail_response = req.get(baseURL + link.attrs['href'],
                                          timeout=REQUEST_TIMEOUT)
                detail_soup = BeautifulSoup(detail_response.text, 'html.parser')
                detail_table = parser.parse_html_table(
                    detail_soup.find_all('table')[0])
                # Scalar lookup: store the description text itself rather
                # than a one-element Series wrapping it.
                descriptions.append(detail_table.loc[1, 'Sighting Report'])
            except Exception:
                # Best effort: a detail page that cannot be fetched or parsed
                # yields an empty description. `except Exception` (not a bare
                # `except`) keeps Ctrl-C able to interrupt the run.
                descriptions.append("")
        table['Desc'] = descriptions
        monthly_tables.append(table)

# pd.concat raises on an empty list, so fall back to an empty frame.
data = pd.concat(monthly_tables) if monthly_tables else pd.DataFrame()
# data = data.dropna(subset=['Date / Time','City']).drop(0,axis=1)
data.to_csv('ufo_data_fullDescs.csv', encoding='utf-8')