In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
import agate

In [3]:
%%bash
## download the mixedbev file
curl -L -O https://comptroller.texas.gov/auto-data/odc/MIXEDBEV_02_2017.CSV

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2565k  100 2565k    0     0  1379k      0  0:00:01  0:00:01 --:--:-- 1414k


In [6]:
%%bash
head -n 5 MIXEDBEV_02_2017.CSV

"MB821424    ","ABI-HAUS                      ","959 N 2ND ST                  ","ABILENE             ","TX","79601","221","          ","2016/12", 000000637.97
"MB638028    ","ABILENE BEEHIVE INC           ","442 CEDAR ST STE A            ","ABILENE             ","TX","79601","221","          ","2017/01", 000002557.12
"MB543114    ","ABILENE BOWLING LANES INC     ","279 RUIDOSA AVE               ","ABILENE             ","TX","79605","221","          ","2017/01", 000000287.49
"MB933130    ","ABILENE CABARET LLC           ","1918 BUTTERNUT ST             ","ABILENE             ","TX","79602","221","          ","2017/01", 000000988.04
"N 037863    ","ABILENE COUNTRY CLUB          ","4039 S TREADAWAY BLVD         ","ABILENE             ","TX","79602","221","          ","2017/01", 000002068.82


In [7]:
file = 'MIXEDBEV_02_2017.CSV'
column_names = [
    'TABC Permit Number',
    'Trade Name',
    'Location Address',
    'Location City',
    'Location State',
    'Location Zip Code',
    'Location County Code',
    'Blank',
    'Report Period',
    'Report Tax'
]
specified_types = {
    'Location Zip Code': agate.Text(),
    'Location County Code': agate.Text()
}

In [8]:
mixbev_raw = agate.Table.from_csv(file, column_names, encoding='iso-8859-1', column_types=specified_types)

In [9]:
print(mixbev_raw)

| column               | data_type |
| -------------------- | --------- |
| TABC Permit Number   | Text      |
| Trade Name           | Text      |
| Location Address     | Text      |
| Location City        | Text      |
| Location State       | Text      |
| Location Zip Code    | Text      |
| Location County Code | Text      |
| Blank                | Boolean   |
| Report Period        | Text      |
| Report Tax           | Number    |



In [10]:
# This creates a new interim table with results of compute function that
# that takes the four columns that need trimming and strips them
# adding them to the end of the table with new names
mixbev_trim = mixbev_raw.compute([
    ('Permit', agate.Formula(agate.Text(), lambda r: r['TABC Permit Number'].strip())),
    ('Name', agate.Formula(agate.Text(), lambda r: r['Trade Name'].strip())),
    ('Address', agate.Formula(agate.Text(), lambda r: r['Location Address'].strip())),
    ('City', agate.Formula(agate.Text(), lambda r: r['Location City'].strip()))
])

In [11]:
## shows the new columns added to the interim table
print(mixbev_trim)

| column               | data_type |
| -------------------- | --------- |
| TABC Permit Number   | Text      |
| Trade Name           | Text      |
| Location Address     | Text      |
| Location City        | Text      |
| Location State       | Text      |
| Location Zip Code    | Text      |
| Location County Code | Text      |
| Blank                | Boolean   |
| Report Period        | Text      |
| Report Tax           | Number    |
| Permit               | Text      |
| Name                 | Text      |
| Address              | Text      |
| City                 | Text      |



In [12]:
## creates new table with just stuff we need with clean names
# new_table = table.select(['3rd_column_name', '1st_column_name', '2nd_column_name'])
mixbev_cleaned = mixbev_trim.select([
    'Permit',
    'Name',
    'Address',
    'City',
    'Location State',
    'Location County Code',
    'Report Period',
    'Report Tax'
]).rename(column_names = {
    'Location State': 'State',
    'Location County Code': 'CountyCode',
    'Report Period': 'Period',
    'Report Tax': 'Tax'
})

In [13]:
## these are now the columns present in our new column
print(mixbev_cleaned)

| column     | data_type |
| ---------- | --------- |
| Permit     | Text      |
| Name       | Text      |
| Address    | Text      |
| City       | Text      |
| State      | Text      |
| CountyCode | Text      |
| Period     | Text      |
| Tax        | Number    |



In [14]:
# and this peeks at the data
# I did send this to_csv and made sure columns were trimmed
mixbev_cleaned.limit(5).print_table()

| Permit   | Name                 | Address              | City    | State | CountyCode | ... |
| -------- | -------------------- | -------------------- | ------- | ----- | ---------- | --- |
| MB638028 | ABILENE BEEHIVE INC  | 442 CEDAR ST STE A   | ABILENE | TX    | 221        | ... |
| MB543114 | ABILENE BOWLING L... | 279 RUIDOSA AVE      | ABILENE | TX    | 221        | ... |
| MB933130 | ABILENE CABARET LLC  | 1918 BUTTERNUT ST    | ABILENE | TX    | 221        | ... |
| N 037863 | ABILENE COUNTRY CLUB | 4039 S TREADAWAY ... | ABILENE | TX    | 221        | ... |
| MB200506 | ABILENE SEAFOOD T... | 1882 S CLACK ST      | ABILENE | TX    | 221        | ... |


In [15]:
# importing countes.csv, ensuring that the 'code' column is text
counties = agate.Table.from_csv('counties.csv', column_types={'code': agate.Text()}).rename()

In [16]:
print(counties)

| column | data_type |
| ------ | --------- |
| id     | Number    |
| county | Text      |
| code   | Text      |



In [17]:
counties.print_table()

| id | county    | code |
| -- | --------- | ---- |
|  1 | Anderson  | 001  |
|  2 | Andrews   | 002  |
|  3 | Angelina  | 003  |
|  4 | Aransas   | 004  |
|  5 | Archer    | 005  |
|  6 | Armstrong | 006  |
|  7 | Atascosa  | 007  |
|  8 | Austin    | 008  |
|  9 | Bailey    | 009  |
| 10 | Bandera   | 010  |
| 11 | Bastrop   | 011  |
| 12 | Baylor    | 012  |
| 13 | Bee       | 013  |
| 14 | Bell      | 014  |
| 15 | Bexar     | 015  |
| 16 | Blanco    | 016  |
| 17 | Borden    | 017  |
| 18 | Bosque    | 018  |
| 19 | Bowie     | 019  |
| 20 | Brazoria  | 020  |
| ... | ...       | ...  |


In [18]:
mixbev_joined = mixbev_cleaned.join(counties, 'CountyCode', 'code')

In [19]:
print(mixbev_joined)

| column     | data_type |
| ---------- | --------- |
| Permit     | Text      |
| Name       | Text      |
| Address    | Text      |
| City       | Text      |
| State      | Text      |
| CountyCode | Text      |
| Period     | Text      |
| Tax        | Number    |
| id         | Number    |
| county     | Text      |



In [24]:
mixbev = mixbev_joined.select([
    'Permit',
    'Name',
    'Address',
    'City',
    'State',
    'county',
    'Period',
    'Tax'
]).rename(column_names = {
    'county': 'County'
})

In [25]:
print(mixbev)

| column  | data_type |
| ------- | --------- |
| Permit  | Text      |
| Name    | Text      |
| Address | Text      |
| City    | Text      |
| State   | Text      |
| County  | Text      |
| Period  | Text      |
| Tax     | Number    |



In [22]:
mixbev.print_table()

| Permit   | Name                 | Address              | City    | State | County | ... |
| -------- | -------------------- | -------------------- | ------- | ----- | ------ | --- |
| MB638028 | ABILENE BEEHIVE INC  | 442 CEDAR ST STE A   | ABILENE | TX    | Taylor | ... |
| MB543114 | ABILENE BOWLING L... | 279 RUIDOSA AVE      | ABILENE | TX    | Taylor | ... |
| MB933130 | ABILENE CABARET LLC  | 1918 BUTTERNUT ST    | ABILENE | TX    | Taylor | ... |
| N 037863 | ABILENE COUNTRY CLUB | 4039 S TREADAWAY ... | ABILENE | TX    | Taylor | ... |
| MB200506 | ABILENE SEAFOOD T... | 1882 S CLACK ST      | ABILENE | TX    | Taylor | ... |
| MB541702 | ABUELO'S BEVERAGE... | 4782 S 14TH ST       | ABILENE | TX    | Taylor | ... |
| MB932373 | ACE IN THE HOLE      | 133 EPLENS CT        | ABILENE | TX    | Taylor | ... |
| MB248134 | BILLIARDS PLUS       | 5495 S 7TH ST        | ABILENE | TX    | Taylor | ... |
| MB685388 | BONZAI JAPANESE S... | 1802 S CLACK ST      | ABILENE | TX    | Tay

### Looking at dates of the records

Here we have to:
- create a tableset using group_by by the period
- creaet a table using aggregate function to count
- create a table to sort the period in reverse order
- Then print the sorted table

In [38]:
by_period = mixbev.group_by('Period')

period_totals = by_period.aggregate([
    ('count', agate.Count())
])

period_totals_sorted = period_totals.order_by('Period', reverse=True)

period_totals_sorted.print_table(max_rows=None)


| Period  |  count |
| ------- | ------ |
| 2017/03 |      1 |
| 2017/02 |     33 |
| 2017/01 | 14,317 |
| 2016/12 |  1,573 |
| 2016/11 |    198 |
| 2016/10 |     71 |
| 2016/09 |     28 |
| 2016/08 |     17 |
| 2016/07 |     10 |
| 2016/06 |      7 |
| 2016/05 |      6 |
| 2016/04 |      7 |
| 2016/03 |      9 |
| 2016/02 |      6 |
| 2016/01 |      5 |
| 2015/12 |      3 |
| 2015/11 |      2 |
| 2015/10 |      4 |
| 2015/09 |      2 |
| 2015/08 |      2 |
| 2015/07 |      1 |
| 2015/06 |      3 |
| 2015/05 |      1 |
| 2015/04 |      2 |
| 2015/03 |      1 |
| 2015/02 |      1 |
| 2014/12 |      1 |
| 2014/08 |      1 |
| 2014/06 |      1 |
| 2014/05 |      1 |
| 2014/04 |      1 |
| 2014/03 |      1 |
| 2014/02 |      2 |
| 2014/01 |      1 |


We have an answer here that we need for the future, and that is the period of time that has the most records. If we are studying a particular month of records, we need to set that so we can change it later with a different dataset.

In [39]:
# setting the month_studied var:
month_studied = '2017/01'

Now we can filter or select the records where the month equals what we want.

In [46]:
mixbev_month = mixbev.where(lambda row: row['Period'] == month_studied)
len(mixbev_month)

14317