* Load UFO data
    * Deal with messed up lines
    
```py
inpath = 'data/ufo/ufo_awesome.tsv'
inf = open(inpath, 'r')

for i, line in enumerate(inf):
    splitline = line.split('\t')
    if len(splitline) != 6:
        first_bad_line = splitline
        print "First bad row:", i
        for j, col in enumerate(first_bad_line):
            print j, col
        break

inf.close()
```

* Deal with headers
* Convert dates and set the index
* Split city and state
* Get a histogram

```py
post90_count = ufo_us.groupby('date_occurred')['date_occurred'].count()
plt.figure()
post90_count.plot()
plt.title('Number of U.S. UFO sightings\nJanuary 1990 through August 2010')
plt.savefig('post90_count_ts.png')
```

* Reshape data: http://slendermeans.org/ml4h-ch1-p4.html


In [None]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
col_names = ["date_occurred", "date_reported", "location", "short_desc", "duration", "description"]
ufo = pd.read_csv("ufo_awesome.tsv", sep="\t", names=col_names, dtype=str)

In [None]:
ufo.head()

In [None]:
ufo.tail()

In [None]:
# NOTE: chained indexing. The boolean-mask selection produces an intermediate
# object and the assignment is applied to that object, so `ufo` itself may be
# left unchanged (pandas typically reports SettingWithCopyWarning for this
# pattern). A single-step `.loc[mask, column] = value` assignment is the
# reliable form.
ufo[ufo['date_occurred'] == "0000"]['date_occurred'] = np.nan

In [None]:
ufo.loc[ufo['date_occurred'] == "0000", 'date_occurred'] = np.nan

In [None]:
ufo[ufo['date_occurred'].isnull()].head()

In [None]:
pd.to_datetime(ufo['date_occurred'], format="%Y%m%d")

In [None]:
ufo.index = pd.to_datetime(ufo['date_reported'], format="%Y%m%d")

In [None]:
ufo.head()

In [None]:
ufo = ufo[["date_occurred", "location", "duration"]]

In [None]:
ufo.head()

In [None]:
import re

In [None]:
location_regex = re.compile(r"\s*(.+), ([A-Z]{2})\s*$")

In [None]:
print(location_regex.match("Jason Aylward"))

In [None]:
m = location_regex.match("Durham, NC")

In [None]:
m.groups()

In [None]:
location_regex.match("Los Alamos, NM").groups()

In [None]:
location_regex.match("Coeur d'Alene, ID").groups()

In [None]:
location_regex.match("Los Alamos, NMX").groups()

In [None]:
location_regex.match("Los Alamos, NM, USA").groups()

In [None]:
def extract_city_state(loc):
    """Split a ``"City, ST"`` location string into a ``(city, state)`` pair.

    The state must be exactly two upper-case letters at the end of the
    string (trailing whitespace is allowed); leading whitespace before the
    city is skipped.

    Args:
        loc: The raw location value. Values the regex cannot be applied to
            (for example ``None`` or a float NaN) are treated as
            unparseable.

    Returns:
        A ``(city, state)`` tuple of strings, or ``(None, None)`` when
        ``loc`` does not match the expected layout.
    """
    pattern = re.compile(r"\s*(.+), ([A-Z]{2})\s*$")

    try:
        found = pattern.match(loc)
    except TypeError:
        # Non-text input: nothing to parse.
        return None, None

    if not found:
        return None, None
    return found.groups()

In [None]:
locations = ufo.location.map(extract_city_state)

In [None]:
locations[:5]

In [None]:
list(locations)[:5]

In [None]:
ufo["city"] = [city for city, state in locations]
ufo["state"] = [state for city, state in locations]
ufo.head()

In [None]:
ufo = ufo[["date_occurred", "city", "state"]]

In [None]:
ufo.tail()

In [None]:
help(ufo.reindex)

In [None]:
ufo.date_occurred.groupby(ufo.index).count()

In [None]:
ufo.date_occurred.groupby(ufo.index).count().plot()

In [None]:
# NOTE(review): `ufo >= ...` compares every cell of the frame with the date,
# not the index, and indexing with the resulting boolean frame masks cells
# rather than dropping rows. Presumably a filter on the index was intended,
# e.g. `ufo[ufo.index >= "1990-01-01"]` -- confirm before changing, since
# later cells may depend on the rows this expression currently keeps.
post1990 = ufo[ufo >= datetime.date(1990, 1, 1)]

In [None]:
post1990.head()

In [None]:
post1990.date_occurred.groupby(post1990.index).count()

In [None]:
# Remove the stray 1905-06-23 entry from the index. errors="ignore" keeps this
# cell re-runnable: without it, executing the cell a second time (when the
# label is already gone) raises KeyError.
post1990 = post1990.drop(datetime.date(1905, 6, 23), axis=0, errors="ignore")

In [None]:
post1990.date_occurred.groupby(post1990.index).count().plot()

In [None]:
# Quarterly counts of non-null values per column. The `how=` keyword of
# `resample` is no longer accepted by current pandas; the aggregation is
# applied to the Resampler object instead. Passing a list to `agg` mirrors the
# original list-of-functions request.
post1990.resample("Q").agg(["count"]).plot()

In [None]:
post1990.index.to_series().map(lambda d: d.weekday())

In [None]:
# weekday() numbers days Monday=0 ... Sunday=6, so the labels are listed
# Monday-first; a Sunday-first list would mislabel every day by one when the
# labels are applied to counts ordered by weekday code.
dow = ["M", "Tu", "W", "Th", "F", "Sa", "Su"]
# Store the numeric weekday code (0-6) of each row's index timestamp.
post1990["day_of_week"] = post1990.index.to_series().map(lambda d: d.weekday())

In [None]:
post1990.head()

In [None]:
data = post1990.day_of_week.value_counts().sort_index()

In [None]:
data.index = dow

In [None]:
data.head()

In [None]:
data.plot(ylim=(0, 10000))

In [None]:
data.plot(kind="pie")

In [None]:
type(data)

In [None]:
isinstance(data, np.ndarray)

In [None]:
np.array(data)