#### What I'm doing:
- Merging data entries with the same date/location/species to get the full mosquito count per species, rather than having counts max out at 50 per entry
- The dataset has already been worked on by Emma G; I'm just doing a little bit extra

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from datetime import datetime
from datetime import timedelta

%matplotlib inline

In [2]:
# Load the training data prepared upstream (see the note at the top of the notebook).
train_csv = "./Emma/train_weather_per_station_extra.csv"
train = pd.read_csv(filepath_or_buffer=train_csv)

In [3]:
# Allow up to 500 columns to be rendered so wide frames are not truncated,
# then preview the first rows.
pd.options.display.max_columns = 500
train.head()

Unnamed: 0,Date,Species,AddressNumberAndStreet,Latitude,Longitude,NumMosquitos,WnvPresent,Year,Month,Day,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Cool,Sunrise,Sunset,PrecipTotal,ResultSpeed,ResultDir
0,2007-05-29,CULEX PIPIENS/RESTUANS,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,1,0,2007,5,29,0,0,1,0,0,0,0,88,60,74,10,58,65,9,421,1917,0.0,5.8,18
1,2007-05-29,CULEX RESTUANS,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,1,0,2007,5,29,0,0,0,1,0,0,0,88,60,74,10,58,65,9,421,1917,0.0,5.8,18
2,2007-05-29,CULEX RESTUANS,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,1,0,2007,5,29,0,0,0,1,0,0,0,88,60,74,10,58,65,9,421,1917,0.0,5.8,18
3,2007-05-29,CULEX PIPIENS/RESTUANS,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,1,0,2007,5,29,0,0,1,0,0,0,0,88,60,74,10,58,65,9,421,1917,0.0,5.8,18
4,2007-05-29,CULEX RESTUANS,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,4,0,2007,5,29,0,0,0,1,0,0,0,88,60,74,10,58,65,9,421,1917,0.0,5.8,18


In [4]:
# Drop the existing one-hot species columns; the text "Species" column is kept
# and serves as one of the grouping keys further down.
species = [
    "CULEX ERRATICUS",
    "CULEX PIPIENS",
    "CULEX PIPIENS/RESTUANS",
    "CULEX RESTUANS",
    "CULEX SALINARIUS",
    "CULEX TARSALIS",
    "CULEX TERRITANS",
]
train.drop(labels=species, axis="columns", inplace=True)

In [5]:
# Build one row per (date, location, species) with the merged values.
#
# A blanket .sum() would also add up Latitude, Longitude, Year, the weather
# readings, etc. for every merged group, which is meaningless, and would turn
# WnvPresent into a count. Aggregate each column the way it should be merged:
#   - NumMosquitos: sum  -> the full count across the split rows
#   - WnvPresent:   max  -> 1 if any of the split rows was positive
#   - anything else: first (expected to be identical within a group)
group_keys = ["Date", "AddressNumberAndStreet", "Species"]
value_cols = [col for col in train.columns if col not in group_keys]

agg_rules = {col: "first" for col in value_cols}
agg_rules["NumMosquitos"] = "sum"
agg_rules["WnvPresent"] = "max"

# sort=False keeps the groups in order of first appearance in `train`.
# Re-selecting value_cols pins the column order to that of `train`.
new = train.groupby(by=group_keys, sort=False).agg(agg_rules)[value_cols]
new.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Latitude,Longitude,NumMosquitos,WnvPresent,Year,Month,Day,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Cool,Sunrise,Sunset,PrecipTotal,ResultSpeed,ResultDir
Date,AddressNumberAndStreet,Species,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-05-29,"4100 N OAK PARK AVE, Chicago, IL",CULEX PIPIENS/RESTUANS,41.95469,-87.800991,1,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18
2007-05-29,"4100 N OAK PARK AVE, Chicago, IL",CULEX RESTUANS,41.95469,-87.800991,1,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18
2007-05-29,"6200 N MANDELL AVE, Chicago, IL",CULEX RESTUANS,41.994991,-87.769279,1,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18
2007-05-29,"7900 W FOSTER AVE, Chicago, IL",CULEX PIPIENS/RESTUANS,41.974089,-87.824812,1,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18
2007-05-29,"7900 W FOSTER AVE, Chicago, IL",CULEX RESTUANS,41.974089,-87.824812,4,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18


In [6]:
# Collapse the *main* dataframe to a single row per date / location / species,
# keeping the first row of each group.

# Assumption: for a given location and date every column is recorded
# identically across the duplicate rows, except the mosquito count and the
# WNV presence flag.

dedupe_keys = ["Date", "AddressNumberAndStreet", "Species"]
train.drop_duplicates(subset=dedupe_keys, keep="first", inplace=True)
train.head()

Unnamed: 0,Date,Species,AddressNumberAndStreet,Latitude,Longitude,NumMosquitos,WnvPresent,Year,Month,Day,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Cool,Sunrise,Sunset,PrecipTotal,ResultSpeed,ResultDir
0,2007-05-29,CULEX PIPIENS/RESTUANS,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,1,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18
1,2007-05-29,CULEX RESTUANS,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,1,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18
2,2007-05-29,CULEX RESTUANS,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,1,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18
3,2007-05-29,CULEX PIPIENS/RESTUANS,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,1,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18
4,2007-05-29,CULEX RESTUANS,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,4,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18


In [7]:
# Sanity check: the grouped frame and the de-duplicated frame should have the
# same number of rows (one per date / location / species combination).
print(new.shape, train.shape, sep="\n")

# row counts match, so the shapes look good

(8475, 19)
(8475, 22)


In [8]:
# Replace the per-row counts (capped at 50) in `train` with the merged totals
# held in `new`, and carry the group-level WNV flag over as well.
merge_keys = ["Date", "AddressNumberAndStreet", "Species"]

# Look the merged values up by key rather than relying on `new` and `train`
# happening to share the same row order.
totals = new.reindex(train.set_index(merge_keys).index)
assert totals["NumMosquitos"].notnull().all(), "some rows in train have no merged total in new"

# Drop and re-add so that NumMosquitos ends up as the last column, as before.
train.drop(labels="NumMosquitos", axis=1, inplace=True)
train["NumMosquitos"] = totals["NumMosquitos"].values

# drop_duplicates(keep="first") kept only the first row's WnvPresent, so a
# positive result on a later split row would be lost. A merged record is
# positive if any of its original rows was positive; the grouped WnvPresent
# is > 0 in exactly that case.
train["WnvPresent"] = (totals["WnvPresent"].values > 0).astype("int64")

In [9]:
# Preview the first ten rows; NumMosquitos is now the last column.
train.head(n=10)

Unnamed: 0,Date,Species,AddressNumberAndStreet,Latitude,Longitude,WnvPresent,Year,Month,Day,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Cool,Sunrise,Sunset,PrecipTotal,ResultSpeed,ResultDir,NumMosquitos
0,2007-05-29,CULEX PIPIENS/RESTUANS,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18,1
1,2007-05-29,CULEX RESTUANS,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18,1
2,2007-05-29,CULEX RESTUANS,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18,1
3,2007-05-29,CULEX PIPIENS/RESTUANS,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18,1
4,2007-05-29,CULEX RESTUANS,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18,4
5,2007-05-29,CULEX RESTUANS,"1500 W WEBSTER AVE, Chicago, IL",41.9216,-87.666455,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18,2
6,2007-05-29,CULEX RESTUANS,"2500 W GRAND AVE, Chicago, IL",41.891118,-87.654491,0,2007,5,29,88,65,77,10,59,66,12,421,1917,0.0,5.8,16,1
7,2007-05-29,CULEX PIPIENS/RESTUANS,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,0,2007,5,29,88,65,77,10,59,66,12,421,1917,0.0,5.8,16,1
8,2007-05-29,CULEX RESTUANS,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,0,2007,5,29,88,65,77,10,59,66,12,421,1917,0.0,5.8,16,2
9,2007-05-29,CULEX RESTUANS,"1100 W CHICAGO, Chicago, IL",41.896282,-87.655232,0,2007,5,29,88,65,77,10,59,66,12,421,1917,0.0,5.8,16,1


In [10]:
# Largest merged count: a value above 50 shows that split records were combined.
train["NumMosquitos"].max()

2532

In [11]:
# re-create the one-hot species columns that were dropped before merging

In [12]:
# One-hot encode Species; the new columns are named "Species_<value>".
train = pd.get_dummies(train, columns=["Species"])

In [13]:
# Parse the Date strings into pandas datetimes.
train["Date"] = pd.to_datetime(train["Date"])

In [14]:
# Use Date as the index.
# Passing the column *label* lets set_index move the column into the index
# and remove it from the frame (drop=True is the default). Passing the Series
# train["Date"] instead installs a copy as the index and leaves the original
# column in place, which is why a separate drop of "Date" used to be needed.
train.set_index("Date", inplace=True)

In [16]:
# Final look at the frame: Date index, merged counts, species dummies.
train.head(n=5)

Unnamed: 0_level_0,AddressNumberAndStreet,Latitude,Longitude,WnvPresent,Year,Month,Day,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Cool,Sunrise,Sunset,PrecipTotal,ResultSpeed,ResultDir,NumMosquitos,Species_CULEX ERRATICUS,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS,Species_CULEX SALINARIUS,Species_CULEX TARSALIS,Species_CULEX TERRITANS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
2007-05-29,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18,1,0,0,1,0,0,0,0
2007-05-29,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18,1,0,0,0,1,0,0,0
2007-05-29,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18,1,0,0,0,1,0,0,0
2007-05-29,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18,1,0,0,1,0,0,0,0
2007-05-29,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,0,2007,5,29,88,60,74,10,58,65,9,421,1917,0.0,5.8,18,4,0,0,0,1,0,0,0


In [17]:
# Write the merged frame to disk for later use (the Date index is written too).
train.to_csv(path_or_buf="train_location_counts.csv")