### Intro to pandas data structures - Part 1

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Show up to 50 columns when a DataFrame is displayed.
# Use the fully qualified option name: the short form 'max_columns' matches
# more than one registered option on recent pandas and raises OptionError.
pd.set_option('display.max_columns', 50)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

### Series: a 1D labeled object similar to an array

In [6]:
# Build a Series from an arbitrary Python list. No index is supplied,
# so the entries are labelled 0..n-1.
# General form: s = pd.Series(myList)
s = pd.Series(data=[7, 'Heisenberg', 3.14, -175, 'Happy Friday!'])
s

0                7
1       Heisenberg
2             3.14
3             -175
4    Happy Friday!
dtype: object

In [8]:
# Same values as before, but with explicit labels instead of the default 0..n-1 index.
s = pd.Series(
    data=[7, 'Heisenberg', 3.14, -175, 'Happy Friday!'],
    index=['A', 'Z', 'C', 'Y', 'E'],
)
s

A                7
Z       Heisenberg
C             3.14
Y             -175
E    Happy Friday!
dtype: object

In [10]:
# A dict converts to a Series whose index is the dict's keys.
# The None for Boston ends up as a missing value.
d = {
    'Chicago': 1000,
    'New York': 1300,
    'Portland': 900,
    'San Francisco': 1100,
    'Austin': 450,
    'Boston': None,
}
cities = pd.Series(data=d)
cities

Chicago          1000.0
New York         1300.0
Portland          900.0
San Francisco    1100.0
Austin            450.0
Boston              NaN
dtype: float64

In [11]:
# Look up a single value by its index label.
cities.loc['Boston']

nan

In [14]:
# old way: select several entries by passing a list of labels to [].
# 'New york' (lower-case "y") is not in the index. Older pandas returned NaN
# for the missing label and emitted a FutureWarning; pandas >= 1.0 raises
# KeyError instead. Catch it so the cell still runs on current pandas and
# displays what happened.
try:
    old_way = cities[['San Francisco', 'New york', 'Chicago']]
except KeyError as missing_label_error:
    old_way = missing_label_error
old_way

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


San Francisco    1100.0
New york            NaN
Chicago          1000.0
dtype: float64

In [16]:
# new way: reindex() accepts labels that are absent from the index
# and fills them with NaN.
cities.reindex(index=['San Francisco', 'New york', 'Chicago'])

San Francisco    1100.0
New york            NaN
Chicago          1000.0
dtype: float64

In [20]:
# Boolean-mask selection: keep only the entries whose value is below 1000.
cities.loc[cities < 1000]

Portland    900.0
Austin      450.0
dtype: float64

In [21]:
# `in` on a Series tests the index labels, not the values.
# NOTE(review): 'Seatle' looks like a typo for 'Seattle'; neither spelling is
# in the index, so the result is False either way.
'Seatle' in cities.index

False

In [22]:
# Overwrite, in place, every entry that is currently below 1000.
cities.loc[cities < 1000] = 999

In [23]:
cities

Chicago          1000.0
New York         1300.0
Portland          999.0
San Francisco    1100.0
Austin            999.0
Boston              NaN
dtype: float64

In [24]:
# Mathematical operations apply element-wise; the missing value stays NaN.

cities.div(4.0)

Chicago          250.00
New York         325.00
Portland         249.75
San Francisco    275.00
Austin           249.75
Boston              NaN
dtype: float64

In [26]:
# np.average does not skip missing values, so one NaN makes the result NaN.
np.average(a=cities)

nan

In [27]:
# Replace Boston's missing value so the average below is defined.
cities.loc['Boston'] = 999

In [30]:
# With no NaN left, the average is a real number; show it to 3 decimals.
mean_value = np.average(cities)
round(mean_value, 3)

1066.167

In [33]:
# Per-entry flag: True where the value is missing (isna is the same method as isnull).
cities.isna()

Chicago          False
New York         False
Portland         False
San Francisco    False
Austin           False
Boston           False
dtype: bool

In [34]:
# Per-entry flag: True where a value is present (notna is the same method as notnull).
cities.notna()

Chicago          True
New York         True
Portland         True
San Francisco    True
Austin           True
Boston           True
dtype: bool

### DataFrame : A tabular data structure
Think of it as a group of Series objects that share an index.

In [36]:
# Change the working directory so the relative CSV/Excel file names used below resolve.
# NOTE(review): this path is specific to the author's machine; adjust it before re-running elsewhere.
%cd ~/Dropbox/Data/

/Users/sathisanvannadil/Dropbox/Data


In [41]:
# Peek at the first 7 raw lines of the CSV (header row plus 6 data rows) before parsing it.
!head -n 7 mariano-rivera.csv

Year,Age,Tm,Lg,W,L,W-L%,ERA,G,GS,GF,CG,SHO,SV,IP,H,R,ER,HR,BB,IBB,SO,HBP,BK,WP,BF,ERA+,WHIP,H/9,HR/9,BB/9,SO/9,SO/BB,Awards
1995,25,NYY,AL,5,3,.625,5.51,19,10,2,0,0,0,67.0,71,43,41,11,30,0,51,2,1,0,301,84,1.507,9.5,1.5,4.0,6.9,1.70,
1996,26,NYY,AL,8,3,.727,2.09,61,0,14,0,0,5,107.2,73,25,25,1,34,3,130,2,0,1,425,240,0.994,6.1,0.1,2.8,10.9,3.82,CYA-3MVP-12
1997,27,NYY,AL,6,4,.600,1.88,66,0,56,0,0,43,71.2,65,17,15,5,20,6,68,0,0,2,301,239,1.186,8.2,0.6,2.5,8.5,3.40,ASMVP-25
1998,28,NYY,AL,3,0,1.000,1.91,54,0,49,0,0,36,61.1,48,13,13,3,17,1,36,1,0,0,246,233,1.060,7.0,0.4,2.5,5.3,2.12,
1999,29,NYY,AL,4,3,.571,1.83,66,0,63,0,0,45,69.0,43,15,14,2,18,3,52,3,1,2,268,257,0.884,5.6,0.3,2.3,6.8,2.89,ASCYA-3MVP-14
2000,30,NYY,AL,7,4,.636,2.85,66,0,61,0,0,36,75.2,58,26,24,4,25,3,58,0,0,2,311,170,1.097,6.9,0.5,3.0,6.9,2.32,AS


In [38]:
# Parse the CSV into a DataFrame; the file's first line supplies the column names.
from_csv = pd.read_csv(filepath_or_buffer='mariano-rivera.csv')

In [39]:
# Preview the first five rows (five is head's default).
from_csv.head(n=5)

Unnamed: 0,Year,Age,Tm,Lg,W,L,W-L%,ERA,G,GS,GF,CG,SHO,SV,IP,H,R,ER,HR,BB,IBB,SO,HBP,BK,WP,BF,ERA+,WHIP,H/9,HR/9,BB/9,SO/9,SO/BB,Awards
0,1995,25,NYY,AL,5,3,0.625,5.51,19,10,2,0,0,0,67.0,71,43,41,11,30,0,51,2,1,0,301,84,1.507,9.5,1.5,4.0,6.9,1.7,
1,1996,26,NYY,AL,8,3,0.727,2.09,61,0,14,0,0,5,107.2,73,25,25,1,34,3,130,2,0,1,425,240,0.994,6.1,0.1,2.8,10.9,3.82,CYA-3MVP-12
2,1997,27,NYY,AL,6,4,0.6,1.88,66,0,56,0,0,43,71.2,65,17,15,5,20,6,68,0,0,2,301,239,1.186,8.2,0.6,2.5,8.5,3.4,ASMVP-25
3,1998,28,NYY,AL,3,0,1.0,1.91,54,0,49,0,0,36,61.1,48,13,13,3,17,1,36,1,0,0,246,233,1.06,7.0,0.4,2.5,5.3,2.12,
4,1999,29,NYY,AL,4,3,0.571,1.83,66,0,63,0,0,45,69.0,43,15,14,2,18,3,52,3,1,2,268,257,0.884,5.6,0.3,2.3,6.8,2.89,ASCYA-3MVP-14


In [42]:
# Peek at the first 5 raw lines; the output shows this file starts directly with data (no header row).
!head -n 5 peyton-passing-TDs-2012.csv

1,1,2012-09-09,DEN,,PIT,W 31-19,3,71,Demaryius Thomas,Trail 7-13,Lead 14-13*
2,1,2012-09-09,DEN,,PIT,W 31-19,4,1,Jacob Tamme,Trail 14-19,Lead 22-19*
3,2,2012-09-17,DEN,@,ATL,L 21-27,2,17,Demaryius Thomas,Trail 0-20,Trail 7-20
4,3,2012-09-23,DEN,,HOU,L 25-31,4,38,Brandon Stokley,Trail 11-31,Trail 18-31
5,3,2012-09-23,DEN,,HOU,L 25-31,4,6,Joel Dreessen,Trail 18-31,Trail 25-31


In [43]:
# The file has no header row, so tell read_csv not to look for one
# and supply the column names explicitly.
cols = [
    'num', 'game', 'date', 'team', 'home_away', 'opponent',
    'result', 'quarter', 'distance', 'receiver', 'score_before', 'score_after',
]
no_headers = pd.read_csv(
    'peyton-passing-TDs-2012.csv',
    sep=',',
    header=None,
    names=cols,
)
no_headers.head()

Unnamed: 0,num,game,date,team,home_away,opponent,result,quarter,distance,receiver,score_before,score_after
0,1,1,2012-09-09,DEN,,PIT,W 31-19,3,71,Demaryius Thomas,Trail 7-13,Lead 14-13*
1,2,1,2012-09-09,DEN,,PIT,W 31-19,4,1,Jacob Tamme,Trail 14-19,Lead 22-19*
2,3,2,2012-09-17,DEN,@,ATL,L 21-27,2,17,Demaryius Thomas,Trail 0-20,Trail 7-20
3,4,3,2012-09-23,DEN,,HOU,L 25-31,4,38,Brandon Stokley,Trail 11-31,Trail 18-31
4,5,3,2012-09-23,DEN,,HOU,L 25-31,4,6,Joel Dreessen,Trail 18-31,Trail 25-31


In [46]:
# Build a DataFrame from a dict of equal-length lists: each key becomes a column.
data = {
    'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
    'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions', 'Lions', 'Lions'],
    'wins': [11, 8, 10, 15, 11, 6, 10, 4],
    'losses': [5, 8, 6, 1, 5, 10, 6, 12],
}

# `columns` fixes the column order explicitly.
football = pd.DataFrame(
    data=data,
    columns=['year', 'team', 'wins', 'losses'],
)
football

Unnamed: 0,year,team,wins,losses
0,2010,Bears,11,5
1,2011,Bears,8,8
2,2012,Bears,10,6
3,2011,Packers,15,1
4,2012,Packers,11,5
5,2010,Lions,6,10
6,2011,Lions,10,6
7,2012,Lions,4,12


In [47]:
# Write the frame to an Excel workbook in the working directory, leaving out the row index.
football.to_excel(excel_writer='football.xlsx', index=False)

In [48]:
# Show the current working directory, i.e. where football.xlsx was just written.
pwd

'/Users/sathisanvannadil/Dropbox/Data'

In [49]:
# Confirm the Excel file was written (this lists the file; it does not delete it).
!ls -l football.xlsx

-rw-r--r--@ 1 sathisanvannadil  staff  5588 Jul 20 11:28 football.xlsx


In [50]:
# Drop the in-memory DataFrame (not the file); the next cell rebinds `football` from football.xlsx.
del football

In [52]:
# Load the worksheet named 'Sheet1' from the workbook back into a DataFrame.
football = pd.read_excel(
    'football.xlsx',
    sheet_name='Sheet1',
)
football

Unnamed: 0,year,team,wins,losses
0,2010,Bears,11,5
1,2011,Bears,8,8
2,2012,Bears,10,6
3,2011,Packers,15,1
4,2012,Packers,11,5
5,2010,Lions,6,10
6,2011,Lions,10,6
7,2012,Lions,4,12
