# BFRO Sightings EDA

In [281]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

In [308]:
# Load the raw BFRO export (first CSV column becomes the row index) and preview it
df = pd.read_csv('data/bfro_raw.csv', index_col=0)
df.head()

Unnamed: 0,reportheader,reportclassification,year,season,month,state,county,nearest_town,observed,also_noticed,other_witnesses,other_stories,time_and_conditions,environment,country,province,location_details
0,Report # 13038,(Class A),2004,Winter,February,Alaska,Anchorage County,Anchorage / Hillside,I and two of my friends were bored one night s...,"Some tracks in the snow, and a clearing in the...",My two friends were snowmachining behind me bu...,I have not heard of any other incidents in Anc...,Middle of the night. The only light was the he...,"In the middle of the woods, in a clearing cove...",,,Up near powerline clearings east of Potter Mar...
1,Report # 8792,(Class B),2003,Winter,December,Alaska,Anchorage County,Anchorage,"Me and a couple of friends had been bored, whe...","We smelled of colonge and after shave, and one...","4. Me, w-man, warren and sean. We were at my h...",no,"Started at 11, ended at about 3-3:30. Weather ...","A pine forest, with a bog or swamp on the righ...",,,"Few houses on the way, a power relay station. ..."
2,Report # 1255,(Class B),1998,Fall,September,Alaska,Bethel County,,My hunting buddy and I were sitting on a ridge...,nothing unusual,Scouting for caribou with high quality binoculars,,,Call Iliamna Air taxi for lat & Long of Long L...,,,"45 miles by air west of Lake Iliamna, Alaska i..."
3,Report # 11616,(Class B),2004,Summer,July,Alaska,Bristol Bay County,Egegik,"To whom it may concern, I am a commercial fish...",Just these foot prints and how obvious it was ...,"One other witness, and he was fishing prior to...","I've only heard of one other story, from an ol...","Approximately 12:30 pm, partially coudy/sunny.","Lake front,creek spit, gravel and sand, alder ...",,,"Approximately 95 miles east of Egegik, Alaska...."
4,Report # 637,(Class A),2000,Summer,June,Alaska,Cordova-McCarthy County,"Kennikot, Alaska",My hiking partner and I arrived late to the Ke...,I did hear what appeared to be grunting in the...,"I was the only witness, there was one other in...",,About 12:00 Midnight / full moon / clear / dim...,This sighting was located at approximately 1 t...,,,"On the main trail toward the glacier, before t..."


In [309]:
# Inspect column dtypes and non-null counts of the freshly loaded raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5345 entries, 0 to 5344
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reportheader          5345 non-null   object
 1   reportclassification  5345 non-null   object
 2   year                  5344 non-null   object
 3   season                5345 non-null   object
 4   month                 4701 non-null   object
 5   state                 5076 non-null   object
 6   county                5076 non-null   object
 7   nearest_town          4984 non-null   object
 8   observed              5304 non-null   object
 9   also_noticed          3469 non-null   object
 10  other_witnesses       4748 non-null   object
 11  other_stories         3764 non-null   object
 12  time_and_conditions   4827 non-null   object
 13  environment           5042 non-null   object
 14  country               269 non-null    object
 15  province              269 non-null    objec

In [335]:
def tweak_df(df):
    """Clean the raw BFRO sightings frame and return a new, typed DataFrame.

    The input frame is left untouched; every step works on a copy.

    Steps performed, in order:
      1. Strip the parentheses from ``reportclassification``.
      2. Take the text following ``#`` in ``reportheader`` (whitespace
         trimmed) as ``reportID``, make it the index, and drop ``reportheader``.
      3. Keep only the part of ``nearest_town`` before the first comma.
      4. Remove the word ``County`` from ``county`` and trim leftover spaces.
      5. Reduce ``year`` to the first standalone 4-digit number found in it.
      6. Drop rows without a usable ``year`` or without a ``month``.
      7. Cast ``year`` to int and the classification/season/location columns
         to ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. Must contain the columns ``reportheader``,
        ``reportclassification``, ``year``, ``season``, ``month``, ``state``,
        ``county``, ``nearest_town``, ``country`` and ``province``.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy indexed by ``reportID``.

    TODO
    ----
    - combine month and year into a real date column
    - pull the number from ``other_witnesses`` into a quantitative field
    - extract any GPS coordinates from the free-text fields
    """

    def string_fix(df_):
        # ``assign`` returns a new frame, so the caller's data is never mutated.
        return df_.assign(
            # Raw strings: '\(' in a plain literal is an invalid escape sequence.
            reportclassification=df_['reportclassification'].replace(
                [r'\(', r'\)'], '', regex=True
            ),
            # ``.str[1]`` yields NaN (instead of raising) when a header has no
            # '#'; strip removes the leading space left by "Report # 123".
            reportID=df_['reportheader'].str.split('#').str[1].str.strip(),
            nearest_town=df_['nearest_town'].str.split(',').str[0],
            # Literal replacement, then trim the space that preceded "County".
            county=df_['county'].str.replace('County', '', regex=False).str.strip(),
        )

    return (
        df
        .pipe(string_fix)
        .set_index('reportID')
        .drop(columns='reportheader')
        # Cast to str first so ``.str`` also works if the CSV parser inferred
        # a numeric dtype for ``year``; unparsable values become NaN.
        .assign(year=lambda df_: df_['year'].astype(str)
                .str.extract(r'(\b\d{4}\b)', expand=False))
        .dropna(subset=['year', 'month'])
        .astype({'reportclassification': 'category', 'year': 'int',
                 'season': 'category', 'county': 'category',
                 'state': 'category', 'month': 'category',
                 'province': 'category', 'country': 'category'})
    )


In [336]:
# Re-read the raw CSV and pass it straight through the cleaning pipeline, then preview
df = tweak_df(pd.read_csv('data/bfro_raw.csv', index_col=0))
df.head()

Unnamed: 0_level_0,reportclassification,year,season,month,state,county,nearest_town,observed,also_noticed,other_witnesses,other_stories,time_and_conditions,environment,country,province,location_details
reportID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
13038,Class A,2004,Winter,February,Alaska,Anchorage,Anchorage / Hillside,I and two of my friends were bored one night s...,"Some tracks in the snow, and a clearing in the...",My two friends were snowmachining behind me bu...,I have not heard of any other incidents in Anc...,Middle of the night. The only light was the he...,"In the middle of the woods, in a clearing cove...",,,Up near powerline clearings east of Potter Mar...
8792,Class B,2003,Winter,December,Alaska,Anchorage,Anchorage,"Me and a couple of friends had been bored, whe...","We smelled of colonge and after shave, and one...","4. Me, w-man, warren and sean. We were at my h...",no,"Started at 11, ended at about 3-3:30. Weather ...","A pine forest, with a bog or swamp on the righ...",,,"Few houses on the way, a power relay station. ..."
1255,Class B,1998,Fall,September,Alaska,Bethel,,My hunting buddy and I were sitting on a ridge...,nothing unusual,Scouting for caribou with high quality binoculars,,,Call Iliamna Air taxi for lat & Long of Long L...,,,"45 miles by air west of Lake Iliamna, Alaska i..."
11616,Class B,2004,Summer,July,Alaska,Bristol Bay,Egegik,"To whom it may concern, I am a commercial fish...",Just these foot prints and how obvious it was ...,"One other witness, and he was fishing prior to...","I've only heard of one other story, from an ol...","Approximately 12:30 pm, partially coudy/sunny.","Lake front,creek spit, gravel and sand, alder ...",,,"Approximately 95 miles east of Egegik, Alaska...."
637,Class A,2000,Summer,June,Alaska,Cordova-McCarthy,Kennikot,My hiking partner and I arrived late to the Ke...,I did hear what appeared to be grunting in the...,"I was the only witness, there was one other in...",,About 12:00 Midnight / full moon / clear / dim...,This sighting was located at approximately 1 t...,,,"On the main trail toward the glacier, before t..."


In [337]:
# Re-check dtypes and non-null counts now that tweak_df has been applied
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4663 entries,  13038 to  13061
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   reportclassification  4663 non-null   category
 1   year                  4663 non-null   int64   
 2   season                4663 non-null   category
 3   month                 4663 non-null   category
 4   state                 4421 non-null   category
 5   county                4421 non-null   category
 6   nearest_town          4404 non-null   object  
 7   observed              4630 non-null   object  
 8   also_noticed          3083 non-null   object  
 9   other_witnesses       4181 non-null   object  
 10  other_stories         3379 non-null   object  
 11  time_and_conditions   4292 non-null   object  
 12  environment           4430 non-null   object  
 13  country               242 non-null    category
 14  province              242 non-null    category
 15  lo

In [324]:
# Tally missing vs. present values in the year column.
# NOTE(review): the saved output below totals 5345 rows, which matches the raw
# frame rather than the cleaned one — re-run after Restart & Run All to confirm.
df['year'].isna().value_counts()

year
False    5278
True       67
Name: count, dtype: int64

0      2004-01-01
1      2003-01-01
2      1998-01-01
3      2004-01-01
4      2000-01-01
          ...    
5340   2001-01-01
5341   1995-01-01
5342   1992-01-01
5343   1995-01-01
5344   1978-01-01
Name: year, Length: 5345, dtype: datetime64[ns]

In [283]:
# Show the representable range of a 16-bit signed integer
np.iinfo(np.int16)

iinfo(min=-32768, max=32767, dtype=int16)