# Intro Pandas 
### Pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language. 

### First we look at Series

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# a Series is like an array, but it's indexed
obj = Series([1,2,3,4,5])
obj

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [3]:
# print just the values and indexes
print(obj.values)
print(obj.index)

[1 2 3 4 5]
RangeIndex(start=0, stop=5, step=1)


In [4]:
# create a Series for WW2 casualties
ww2_cas = Series([87000000, 43000000, 3000000, 2100000, 400000], index = ['USSR','Germany', 'China', 'Japan', 'USA'])
ww2_cas

USSR       87000000
Germany    43000000
China       3000000
Japan       2100000
USA          400000
dtype: int64

In [5]:
ww2_cas['Germany']

43000000

In [6]:
# Check wich coutries had casualties greater than 4 mill
ww2_cas[ww2_cas > 4000000]

USSR       87000000
Germany    43000000
dtype: int64

In [7]:
# We can tread the Series as an ordered dictionary
'USSR' in ww2_cas

True

In [8]:
# we can create a dictionary from this
ww2_dict = ww2_cas.to_dict()
ww2_dict

{'USSR': 87000000,
 'Germany': 43000000,
 'China': 3000000,
 'Japan': 2100000,
 'USA': 400000}

In [9]:
# we can feed dictionaries into series
ww2_series = Series(ww2_dict)
ww2_series

USSR       87000000
Germany    43000000
China       3000000
Japan       2100000
USA          400000
dtype: int64

In [10]:
# index a Series with a list, Argentina is not in the Series so it will give it NaN as a value
countries = ['China', 'Germany', 'Japan', 'Argentina']
obj2 = Series(ww2_dict, index = countries)
obj2

China         3000000.0
Germany      43000000.0
Japan         2100000.0
Argentina           NaN
dtype: float64

In [11]:
# isnull -> True if null, notnull -> True if notnull
pd.isnull(obj2)

China        False
Germany      False
Japan        False
Argentina     True
dtype: bool

In [12]:
pd.notnull(obj2)

China         True
Germany       True
Japan         True
Argentina    False
dtype: bool

In [13]:
ww2_series

USSR       87000000
Germany    43000000
China       3000000
Japan       2100000
USA          400000
dtype: int64

In [14]:
# add together two series
ww2_series + obj2

Argentina           NaN
China         6000000.0
Germany      86000000.0
Japan         4200000.0
USA                 NaN
USSR                NaN
dtype: float64

In [15]:
# name the series
obj2.name = "World War 2 Casualties"
obj2

China         3000000.0
Germany      43000000.0
Japan         2100000.0
Argentina           NaN
Name: World War 2 Casualties, dtype: float64

In [16]:
# name the index
obj2.index.name = "Coutries"
obj2

Coutries
China         3000000.0
Germany      43000000.0
Japan         2100000.0
Argentina           NaN
Name: World War 2 Casualties, dtype: float64

## Intro to DataFrames

In [17]:
import webbrowser
website = 'https://en.wikipedia.org/wiki/List_of_all-time_NFL_win%E2%80%93loss_records'

webbrowser.open(website)
# copy table to clipboard

True

In [18]:
nfl_frame = pd.read_clipboard()
nfl_frame

Unnamed: 0,Rank,Team,GP,Won,Lost,Tied,Pct.,First NFL Season,Division
0,1,Dallas Cowboys,914,520,388,6,0.572,1960,NFC East
1,2,Green Bay Packers,1368,756,574,38,0.567,1921,NFC North
2,3,New England Patriots,916,512,395,9,0.564,1960,AFC East
3,4,Chicago Bears,1402,769,591,42,0.563,1920,NFC North
4,5,Baltimore Ravens,384,214,169,1,0.559,1996,AFC North
5,6,Miami Dolphins,832,457,371,4,0.552,1966,AFC East
6,7,Minnesota Vikings,902,488,403,11,0.547,1961,NFC North
7,8,San Francisco 49ers,1034,545,475,14,0.534,1950,NFC West
8,9,New York Giants,1337,696,608,33,0.533,1925,NFC East
9,10,Denver Broncos,916,483,423,10,0.533,1960,AFC West


In [19]:
# read all data from CSV
nfl_data = pd.read_csv('nfl_data.csv', sep=';') 
nfl_data


Unnamed: 0,Rank,Team,GP,Won,Lost,Tied,Pct.,First NFL Season,Division
0,1,Dallas Cowboys,914,520,388,6,0.572,1960,NFC East
1,2,Green Bay Packers,1368,756,574,38,0.567,1921,NFC North
2,3,New England Patriots,916,512,395,9,0.564,1960,AFC East
3,4,Chicago Bears,1402,769,591,42,0.563,1920,NFC North
4,5,Baltimore Ravens,384,214,169,1,0.559,1996,AFC North
5,6,Miami Dolphins,832,457,371,4,0.552,1966,AFC East
6,7,Minnesota Vikings,902,488,403,11,0.547,1961,NFC North
7,8,San Francisco 49ers,1034,545,475,14,0.534,1950,NFC West
8,9,New York Giants,1337,696,608,33,0.533,1925,NFC East
9,10,Denver Broncos,916,483,423,10,0.533,1960,AFC West


In [20]:
nfl_data2 = DataFrame(nfl_data, columns = ['Team', 'First NFL Season', 'Division'])
nfl_data2

Unnamed: 0,Team,First NFL Season,Division
0,Dallas Cowboys,1960,NFC East
1,Green Bay Packers,1921,NFC North
2,New England Patriots,1960,AFC East
3,Chicago Bears,1920,NFC North
4,Baltimore Ravens,1996,AFC North
5,Miami Dolphins,1966,AFC East
6,Minnesota Vikings,1961,NFC North
7,San Francisco 49ers,1950,NFC West
8,New York Giants,1925,NFC East
9,Denver Broncos,1960,AFC West


In [21]:
# save our data to a pickle file
nfl_data.to_pickle("./nfl_data.pkl")

In [22]:
# read it to a new dataframe
nfl_frame = pd.read_pickle("./nfl_data.pkl")
nfl_frame

Unnamed: 0,Rank,Team,GP,Won,Lost,Tied,Pct.,First NFL Season,Division
0,1,Dallas Cowboys,914,520,388,6,0.572,1960,NFC East
1,2,Green Bay Packers,1368,756,574,38,0.567,1921,NFC North
2,3,New England Patriots,916,512,395,9,0.564,1960,AFC East
3,4,Chicago Bears,1402,769,591,42,0.563,1920,NFC North
4,5,Baltimore Ravens,384,214,169,1,0.559,1996,AFC North
5,6,Miami Dolphins,832,457,371,4,0.552,1966,AFC East
6,7,Minnesota Vikings,902,488,403,11,0.547,1961,NFC North
7,8,San Francisco 49ers,1034,545,475,14,0.534,1950,NFC West
8,9,New York Giants,1337,696,608,33,0.533,1925,NFC East
9,10,Denver Broncos,916,483,423,10,0.533,1960,AFC West


In [23]:
# call a column that does not exist in the dataframe
# Stadium is not in the dataframe, pandas will create a column with this name 

DataFrame(nfl_frame, columns = ['Team', 'GP', 'Won', 'Lost', 'Stadium']).head()


Unnamed: 0,Team,GP,Won,Lost,Stadium
0,Dallas Cowboys,914,520,388,
1,Green Bay Packers,1368,756,574,
2,New England Patriots,916,512,395,
3,Chicago Bears,1402,769,591,
4,Baltimore Ravens,384,214,169,


In [24]:
# call columns by its name
nfl_frame.Team.head()

0          Dallas Cowboys
1       Green Bay Packers
2    New England Patriots
3           Chicago Bears
4        Baltimore Ravens
Name: Team, dtype: object

In [25]:
# call rows by its index -> loc or iloc
nfl_frame.iloc[3]

Rank                            4
Team                Chicago Bears
GP                          1,402
Won                           769
Lost                          591
Tied                           42
Pct.                        0.563
First NFL Season             1920
Division                NFC North
Name: 3, dtype: object

In [26]:
# add new column, add numbers as values
nfl_frame['Stadium'] = np.arange(len(nfl_frame['Team']))
nfl_frame

Unnamed: 0,Rank,Team,GP,Won,Lost,Tied,Pct.,First NFL Season,Division,Stadium
0,1,Dallas Cowboys,914,520,388,6,0.572,1960,NFC East,0
1,2,Green Bay Packers,1368,756,574,38,0.567,1921,NFC North,1
2,3,New England Patriots,916,512,395,9,0.564,1960,AFC East,2
3,4,Chicago Bears,1402,769,591,42,0.563,1920,NFC North,3
4,5,Baltimore Ravens,384,214,169,1,0.559,1996,AFC North,4
5,6,Miami Dolphins,832,457,371,4,0.552,1966,AFC East,5
6,7,Minnesota Vikings,902,488,403,11,0.547,1961,NFC North,6
7,8,San Francisco 49ers,1034,545,475,14,0.534,1950,NFC West,7
8,9,New York Giants,1337,696,608,33,0.533,1925,NFC East,8
9,10,Denver Broncos,916,483,423,10,0.533,1960,AFC West,9


In [27]:
# delete columns !!
del nfl_frame['Stadium']
nfl_frame.head()


Unnamed: 0,Rank,Team,GP,Won,Lost,Tied,Pct.,First NFL Season,Division
0,1,Dallas Cowboys,914,520,388,6,0.572,1960,NFC East
1,2,Green Bay Packers,1368,756,574,38,0.567,1921,NFC North
2,3,New England Patriots,916,512,395,9,0.564,1960,AFC East
3,4,Chicago Bears,1402,769,591,42,0.563,1920,NFC North
4,5,Baltimore Ravens,384,214,169,1,0.559,1996,AFC North


In [28]:
# convert equal lenght list in a dictionary to a DataFrame
age_data = {'People':['Peter', 'Bob', 'Marduk'], 'Age':[34, 43, 43]}
age_df = DataFrame(age_data)
age_df.head()

Unnamed: 0,People,Age
0,Peter,34
1,Bob,43
2,Marduk,43
