In [1]:
import pandas as pd

In [2]:
from pandas import Series, DataFrame

In [3]:
Series

pandas.core.series.Series

In [4]:
# list vs Series

In [5]:
nums = [300, 200, 100]
nums

[300, 200, 100]

In [6]:
s = Series(nums)
s

0    300
1    200
2    100
dtype: int64

In [7]:
list(s)

[300, 200, 100]

In [8]:
# dict vs. Series

In [9]:
# Build the mapping with keyword arguments; keys keep their insertion order.
d = dict(one=1, two=2, three=3)
d

{'one': 1, 'two': 2, 'three': 3}

In [10]:
s = Series(d)
s

one      1
two      2
three    3
dtype: int64

In [11]:
s["two"]

2

In [12]:
s[1:]

two      2
three    3
dtype: int64

In [13]:
Series({0:99, 1:88, 2:77})

0    99
1    88
2    77
dtype: int64

In [14]:
s = Series([99,88,77]) # very similar to above
s

0    99
1    88
2    77
dtype: int64

In [15]:
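# this fails with a KeyError: with the default 0, 1, 2 index, -1 is looked up
# as a label (not "last element" as in a list); use s.iloc[-1] for that
# s[-1]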

In [16]:
# Same data as a label -> value dict, spelled as the values plus an explicit
# integer index that includes the label -1.
s = Series([55, 99, 88, 77], index=[-1, 0, 1, 2])
s

-1    55
 0    99
 1    88
 2    77
dtype: int64

In [17]:
s[-1], s[1]

(55, 88)

In [18]:
# this fails with a KeyError: [] looks up index labels, and 55 is a value, not a label
# s[55]

In [19]:
nums = Series([100,200,300])
nums

0    100
1    200
2    300
dtype: int64

In [20]:
nums + 1

0    101
1    201
2    301
dtype: int64

In [21]:
shorter = Series([5, 6])
nums + shorter

0    105.0
1    206.0
2      NaN
dtype: float64

In [22]:
list(nums) + list(shorter)

[100, 200, 300, 5, 6]

In [23]:
shorter

0    5
1    6
dtype: int64

In [24]:
shorter * shorter

0    25
1    36
dtype: int64

In [25]:
# Two equal-length Series with the default index.
s1, s2 = Series([3, 4, 5]), Series([5, 4, 3])

In [26]:
s1

0    3
1    4
2    5
dtype: int64

In [27]:
s2

0    5
1    4
2    3
dtype: int64

In [28]:
s1 > s2

0    False
1    False
2     True
dtype: bool

In [29]:
s1 == s2

0    False
1     True
2    False
dtype: bool

In [30]:
s1 = Series([False, False, True])
s2 = Series([False, True, True])

In [31]:
s1 | s2 # | is the element-wise "or"; the `or` keyword itself raises ValueError on a Series like these

0    False
1     True
2     True
dtype: bool

In [32]:
s1 & s2 # & is the element-wise "and"; the `and` keyword itself raises ValueError on a Series like these

0    False
1    False
2     True
dtype: bool

In [33]:
# data alignment

In [34]:
s1 = Series([100, 200, 300])
s1

0    100
1    200
2    300
dtype: int64

In [35]:
# Label -> value pairs; the labels run 3, 2, 1 in that order.
s2 = Series({3: 11, 2: 22, 1: 33})
s2

3    11
2    22
1    33
dtype: int64

In [36]:
s1 + s2

0      NaN
1    233.0
2    322.0
3      NaN
dtype: float64

In [37]:
# fancy indexing

# obj[X]
#
# what can X be?
# 1. index (int)
# 2. key (a lot of things)
# 3. boolean Series (Series[bool Series])    <-- this is fancy indexing
#    (a.k.a. boolean indexing / masking): keeps the entries where X is True

In [38]:
s = Series(["A", "B", "C", "D"])
s

0    A
1    B
2    C
3    D
dtype: object

In [39]:
# Two ways to build the same boolean mask: explicit labels 0..3 via a dict,
# or a plain list that receives the default 0..3 index.
b1 = Series({0:True, 1:False, 2:False, 3:True})
b2 = Series([True, False, False, True])
b2

0     True
1    False
2    False
3     True
dtype: bool

In [40]:
s[b1]

0    A
3    D
dtype: object

In [41]:
s[b2]

0    A
3    D
dtype: object

In [42]:
# combine element-wise bool ops with fancy indexing

In [43]:
s = Series([1, 99, 50, 40, 45, 20])
s

0     1
1    99
2    50
3    40
4    45
5    20
dtype: int64

In [44]:
b = s < 25
b

0     True
1    False
2    False
3    False
4    False
5     True
dtype: bool

In [45]:
s[b]

0     1
5    20
dtype: int64

In [46]:
s[s < 25]

0     1
5    20
dtype: int64

In [47]:
s[s > 40]

1    99
2    50
4    45
dtype: int64

In [48]:
list(s[s > 40])

[99, 50, 45]

In [49]:
# Keep only the values above 40 and renumber them 0..n-1.
# reset_index(drop=True) does this directly, without a round trip through a
# Python list (which would also lose the integer dtype if nothing matched).
s[s > 40].reset_index(drop=True)

0    99
1    50
2    45
dtype: int64

In [50]:
s = Series([1, 99, 50, 40, 45, 20])
s

0     1
1    99
2    50
3    40
4    45
5    20
dtype: int64

In [51]:
# want all the odd numbers
s[s % 2 == 1]

0     1
1    99
4    45
dtype: int64

In [52]:
# want odd numbers greater than 10
s % 2 == 1

0     True
1     True
2    False
3    False
4     True
5    False
dtype: bool

In [53]:
s > 10

0    False
1     True
2     True
3     True
4     True
5     True
dtype: bool

In [54]:
# Combine the two masks element-wise. The parentheses are required: & binds
# more tightly than == and >, so without them Python would evaluate 1 & s first.
s[(s % 2 == 1) & (s > 10)]

1    99
4    45
dtype: int64

In [55]:
s = Series(["dog", "CAT", "apple", "BANANA"])
s

0       dog
1       CAT
2     apple
3    BANANA
dtype: object

In [56]:
s.str.upper()

0       DOG
1       CAT
2     APPLE
3    BANANA
dtype: object

In [57]:
s == s.str.upper()

0    False
1     True
2    False
3     True
dtype: bool

In [58]:
s[s == s.str.upper()]

1       CAT
3    BANANA
dtype: object

In [59]:
s = Series(["dog", "apple"])
s[s == s.str.upper()]

Series([], dtype: object)

In [64]:
# DataFrames = Tables

In [65]:
names = Series(["Alice", "Bob", "Cindy"])
scores = Series([10, 20, 30])

In [66]:
names

0    Alice
1      Bob
2    Cindy
dtype: object

In [67]:
scores

0    10
1    20
2    30
dtype: int64

In [68]:
DataFrame({"name":names, "score":scores})

Unnamed: 0,name,score
0,Alice,10
1,Bob,20
2,Cindy,30


In [78]:
# Small three-row table: one dict key per column, one list entry per row.
df = DataFrame(
    {
        "name": ["Alice", "Bob", "Cindy"],
        "score": [10, 20, 30],
        "where": [0] * 3,
    }
)

In [79]:
df

Unnamed: 0,name,score,where
0,Alice,10,0
1,Bob,20,0
2,Cindy,30,0


In [72]:
df["score"]

0    10
1    20
2    30
Name: score, dtype: int64

In [73]:
df["name"]

0    Alice
1      Bob
2    Cindy
Name: name, dtype: object

In [75]:
# Attribute access is shorthand for df["score"]. It only works when the column
# name is a valid Python identifier that does not collide with an existing
# DataFrame attribute -- e.g. df.where is a method, so that column needs df["where"].
df.score

0    10
1    20
2    30
Name: score, dtype: int64

In [81]:
df["where"]

0    0
1    0
2    0
Name: where, dtype: int64

In [82]:
df

Unnamed: 0,name,score,where
0,Alice,10,0
1,Bob,20,0
2,Cindy,30,0


In [83]:
df.loc[2]

name     Cindy
score       30
where        0
Name: 2, dtype: object

In [84]:
df.loc[1]

name     Bob
score     20
where      0
Name: 1, dtype: object

In [85]:
df.iloc[1]

name     Bob
score     20
where      0
Name: 1, dtype: object

In [88]:
df

Unnamed: 0,name,score,where
0,Alice,10,0
1,Bob,20,0
2,Cindy,30,0


In [90]:
new_df = df.set_index("name")
new_df

Unnamed: 0_level_0,score,where
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,10,0
Bob,20,0
Cindy,30,0


In [92]:
new_df.loc["Cindy"]

score    30
where     0
Name: Cindy, dtype: int64

In [93]:
new_df.iloc[2]

score    30
where     0
Name: Cindy, dtype: int64

In [94]:
new_df.iloc[-1]

score    30
where     0
Name: Cindy, dtype: int64

In [95]:
new_df

Unnamed: 0_level_0,score,where
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,10,0
Bob,20,0
Cindy,30,0


In [97]:
new_df.loc["Bob", "score"]

20

In [98]:
new_df.loc["Bob", "score"] = 21

In [99]:
new_df

Unnamed: 0_level_0,score,where
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,10,0
Bob,21,0
Cindy,30,0


In [100]:
df

Unnamed: 0,name,score,where
0,Alice,10,0
1,Bob,20,0
2,Cindy,30,0


In [101]:
new_df

Unnamed: 0_level_0,score,where
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,10,0
Bob,21,0
Cindy,30,0


In [102]:
new_df

Unnamed: 0_level_0,score,where
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,10,0
Bob,21,0
Cindy,30,0


In [104]:
new_df.iloc[2, 0] = 31

In [105]:
new_df

Unnamed: 0_level_0,score,where
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,10,0
Bob,21,0
Cindy,31,0


In [106]:
# ways to create a DataFrame

In [107]:
# Way 1: from a dict of columns (column name -> list of row values).
df = DataFrame(dict(
    name=["Alice", "Bob", "Cindy"],
    score=[10, 20, 30],
    where=[0, 0, 0],
))

In [109]:
df = pd.read_csv("Fifa18.csv")

In [111]:
df.head()

Unnamed: 0,Id,name,Age,nationality,club,league,euro_wage,networth,score_of_100
0,20801,Cristiano Ronaldo,32,Portugal,Real Madrid CF,Spanish Primera División,565000,95500000,94
1,158023,L. Messi,30,Argentina,FC Barcelona,Spanish Primera División,565000,105000000,93
2,190871,Neymar,25,Brazil,Paris Saint-Germain,French Ligue 1,280000,123000000,92
3,176580,L. Suárez,30,Uruguay,FC Barcelona,Spanish Primera División,510000,97000000,92
4,167495,M. Neuer,31,Germany,FC Bayern Munich,German Bundesliga,230000,61000000,92


In [115]:
df["euro_wage"].max()

565000

In [116]:
df["Age"].max()

47

In [None]:
# Fancy Indexing
# 1. Series[BoolSeries]
# 2. DataFrame[BoolSeries]

In [118]:
df[df["Age"] == df["Age"].max()]

Unnamed: 0,Id,name,Age,nationality,club,league,euro_wage,networth,score_of_100
17465,11728,B. Richardson,47,England,Wycombe Wanderers,English League Two,1000,0,46


In [119]:
df[df["Age"] >= 40]

Unnamed: 0,Id,name,Age,nationality,club,league,euro_wage,networth,score_of_100
1322,137854,M. Storari,40,Italy,Milan,Italian Serie A,38000,525000,76
2126,3665,B. Nivet,40,France,ES Troyes AC,French Ligue 1,16000,0,74
3022,17605,T. Simons,40,Belgium,Club Brugge KV,Belgian First Division A,14000,0,73
3182,176900,M. Candelo,40,Colombia,Asociacion Deportivo Cali,Colombian Primera A,2000,0,72
4685,140029,O. Pérez,44,Mexico,Pachuca,Mexican Liga MX,9000,160000,71
5149,188033,E. El Hadary,44,Egypt,Al Taawoun,Saudi Professional League,7000,120000,70
6458,20731,Quim,41,Portugal,CD Aves,Portuguese Primeira Liga,3000,70000,69
7505,148745,K. Wæhler,41,Norway,Sogndal,Norwegian Eliteserien,2000,0,67
15719,591,C. Day,41,England,Stevenage,English League Two,1000,10000,57
16276,53506,D. Coyne,43,Wales,Shrewsbury,English League One,1000,10000,55


In [123]:
stats = df.describe()
stats

Unnamed: 0,Id,Age,euro_wage,networth,score_of_100
count,17469.0,17469.0,17469.0,17469.0,17469.0
mean,207584.09846,25.133265,11696.433683,2407283.0,66.271395
std,32339.098668,4.597354,23249.349262,5391004.0,6.931154
min,16.0,16.0,1000.0,0.0,46.0
25%,192567.0,21.0,2000.0,325000.0,62.0
50%,214016.0,25.0,4000.0,700000.0,66.0
75%,231383.0,28.0,12000.0,2100000.0,71.0
max,241219.0,47.0,565000.0,123000000.0,94.0


In [126]:
stddev = stats.loc["std", "euro_wage"]
stddev

23249.349262210257

In [129]:
# Rows whose wage is more than 5 standard deviations above the mean wage.
df.loc[df["euro_wage"] > df["euro_wage"].mean() + 5 * stddev]

Unnamed: 0,Id,name,Age,nationality,club,league,euro_wage,networth,score_of_100
0,20801,Cristiano Ronaldo,32,Portugal,Real Madrid CF,Spanish Primera División,565000,95500000,94
1,158023,L. Messi,30,Argentina,FC Barcelona,Spanish Primera División,565000,105000000,93
2,190871,Neymar,25,Brazil,Paris Saint-Germain,French Ligue 1,280000,123000000,92
3,176580,L. Suárez,30,Uruguay,FC Barcelona,Spanish Primera División,510000,97000000,92
4,167495,M. Neuer,31,Germany,FC Bayern Munich,German Bundesliga,230000,61000000,92
5,188545,R. Lewandowski,28,Poland,FC Bayern Munich,German Bundesliga,355000,92000000,91
6,193080,De Gea,26,Spain,Manchester United,English Premier League,215000,64500000,90
7,183277,E. Hazard,26,Belgium,Chelsea,English Premier League,295000,90500000,90
8,182521,T. Kroos,27,Germany,Real Madrid CF,Spanish Primera División,340000,79000000,90
9,167664,G. Higuaín,29,Argentina,Juventus,Italian Serie A,275000,77000000,90
