<p><h1>Pandas</h1></p>

In [6]:
import math

import numpy as np
import pandas as pd

<h3>Working with Series</h3> <p>A Series is a one-dimensional, array-like object with an index.</p>

In [7]:
# Build the demo Series holding 1..5; the index shown on the left is the default 0..4.
x = pd.Series(range(1, 6))
x

0    1
1    2
2    3
3    4
4    5
dtype: int64

<p><h3>Basic operations</h3></p>

In [8]:
# Arithmetic with a scalar is applied element-wise: every value is increased by 3.
x+3

0    4
1    5
2    6
3    7
4    8
dtype: int64

In [9]:
# Operators compose element-wise: each value is squared, then 100 is added.
(x**2)+100

0    101
1    104
2    109
3    116
4    125
dtype: int64

In [10]:
# A comparison produces a boolean Series with one True/False per element.
x>2

0    False
1    False
2     True
3     True
4     True
dtype: bool

<h3>any()</h3> <p>Returns True if at least one value is True.</p>
<h3>all()</h3> <p>Returns True if all the values are True.</p>

In [11]:
# Boolean mask marking the elements of x that exceed 2 (method form of `x > 2`).
larger_than_2 = x.gt(2)
larger_than_2

0    False
1    False
2     True
3     True
4     True
dtype: bool

In [12]:
larger_than_2.any()

True

In [13]:
larger_than_2.all()

False

<h3>apply()</h3> <p>Applies a function to every element of the Series without an explicit loop, instead of calling it item by item.</p>

In [14]:
def f(x):
    """Scale a number by its parity: even values are doubled, odd values are tripled."""
    factor = 2 if x % 2 == 0 else 3
    return x * factor

x.apply(f)    

0     3
1     4
2     9
3     8
4    15
dtype: int64

<p><strong>Avoid looping through the series</strong></p>
<p>Task: compare the runtime with and without an explicit loop.</p>

In [17]:
%%timeit
# Loop version: build the Series, then read each element, pass it through f
# and write the result back one position at a time.
# Note that the Series construction is inside the timed cell as well.
ds = pd.Series(range(1,10000))
for counter in range(len(ds)):
    ds[counter] = f(ds[counter])

10 loops, best of 3: 140 ms per loop


In [19]:
%%timeit
# apply() version: the same Series and the same f, but the per-element calls
# are driven by a single apply() call. The result is discarded; only the
# runtime is of interest here.
ds = pd.Series(range(1,10000))
ds.apply(f)

100 loops, best of 3: 11.1 ms per loop


In [20]:
# astype returns a converted Series; x is not reassigned here, so x itself keeps its int64 dtype.
x.astype(np.float64)

0    1
1    2
2    3
3    4
4    5
dtype: float64

In [21]:
# Plain assignment does not copy: y and x now refer to the same Series,
# so a write through y is also visible through x.
y = x

In [22]:
y[0]

1

In [23]:
y[0] = 100

In [24]:
y

0    100
1      2
2      3
3      4
4      5
dtype: int64

In [25]:
x

0    100
1      2
2      3
3      4
4      5
dtype: int64

In [26]:
# copy() gives y its own data, so later writes to x no longer show up in y.
y = x.copy()

In [27]:
x[0] = 1

In [28]:
x

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [29]:
y

0    100
1      2
2      3
3      4
4      5
dtype: int64

In [59]:
# Summary statistics for x. The `percentile_width` keyword no longer exists in
# pandas; `percentiles` requests the same quartiles (25%, 50%, 75%) explicitly.
x.describe(percentiles=[0.25, 0.5, 0.75])



count    5.000000
mean     3.000000
std      1.581139
min      1.000000
25%      2.000000
50%      3.000000
75%      4.000000
max      5.000000
dtype: float64

<h3>DataFrame</h3>

In [31]:
def factorial(x):
    """Print ``x! = <value>`` and return the factorial of ``x``.

    Parameters
    ----------
    x : int
        Non-negative integer.

    Returns
    -------
    int
        The factorial of ``x``.

    Raises
    ------
    ValueError
        If ``x`` is negative (raised by ``math.factorial``).
    """
    # Use the standard-library function directly: ``np.math`` was only an
    # undocumented alias of the ``math`` module and is gone in NumPy 2.0.
    # Compute once and reuse the value for both the message and the return.
    result = math.factorial(x)
    print("%s! = %s" % (x, result))
    return result
    

In [32]:
# Source values 1..9 and a single-column frame built from them.
data = list(range(1, 10))
df = pd.DataFrame({"x": data})

In [33]:
df

Unnamed: 0,x
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9


In [34]:
df["x"]

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
8    9
Name: x, dtype: int64

In [35]:
# Value of column "x" in the row labelled 0, fetched with a single .loc lookup.
df.loc[0, "x"]

1

In [36]:
# Derived column: 2 is added to every value of "x".
# NOTE(review): the $...$ in the label is presumably there so the header renders as math — confirm.
df["$x+2$"] = df["x"]+2

In [37]:
df

Unnamed: 0,x,$x+2$
0,1,3
1,2,4
2,3,5
3,4,6
4,5,7
5,6,8
6,7,9
7,8,10
8,9,11


In [38]:
# Element-wise square of "x".
df["$x^2$"] = df["x"]**2
# Factorial has no vectorised operator, so the scalar function is applied per element.
# math.factorial is used directly: np.math was only an undocumented alias of the
# standard-library math module and is no longer available in NumPy 2.0.
df["$x!$"] = df["x"].apply(math.factorial)

In [39]:
df

Unnamed: 0,x,$x+2$,$x^2$,$x!$
0,1,3,1,1
1,2,4,4,2
2,3,5,9,6
3,4,6,16,24
4,5,7,25,120
5,6,8,36,720
6,7,9,49,5040
7,8,10,64,40320
8,9,11,81,362880


In [40]:
# Boolean flag column: True where the remainder of x divided by 2 is zero.
df["is_even"] = df["x"].mod(2).eq(0)

In [41]:
df

Unnamed: 0,x,$x+2$,$x^2$,$x!$,is_even
0,1,3,1,1,False
1,2,4,4,2,True
2,3,5,9,6,False
3,4,6,16,24,True
4,5,7,25,120,False
5,6,8,36,720,True
6,7,9,49,5040,False
7,8,10,64,40320,True
8,9,11,81,362880,False


In [42]:
# Translate the boolean flag into a readable label via a lookup table.
parity_labels = {True: "even", False: "odd"}
df["odd_even"] = df["is_even"].map(parity_labels)
df

Unnamed: 0,x,$x+2$,$x^2$,$x!$,is_even,odd_even
0,1,3,1,1,False,odd
1,2,4,4,2,True,even
2,3,5,9,6,False,odd
3,4,6,16,24,True,even
4,5,7,25,120,False,odd
5,6,8,36,720,True,even
6,7,9,49,5040,False,odd
7,8,10,64,40320,True,even
8,9,11,81,362880,False,odd


In [43]:
df

Unnamed: 0,x,$x+2$,$x^2$,$x!$,is_even,odd_even
0,1,3,1,1,False,odd
1,2,4,4,2,True,even
2,3,5,9,6,False,odd
3,4,6,16,24,True,even
4,5,7,25,120,False,odd
5,6,8,36,720,True,even
6,7,9,49,5040,False,odd
7,8,10,64,40320,True,even
8,9,11,81,362880,False,odd


In [44]:
# Remove the helper flag now that "odd_even" carries the same information.
# The column is named through the `columns` keyword: passing the axis
# positionally (drop("is_even", 1)) is rejected by pandas 2.x.
df = df.drop(columns="is_even")
df

Unnamed: 0,x,$x+2$,$x^2$,$x!$,odd_even
0,1,3,1,1,odd
1,2,4,4,2,even
2,3,5,9,6,odd
3,4,6,16,24,even
4,5,7,25,120,odd
5,6,8,36,720,even
6,7,9,49,5040,odd
7,8,10,64,40320,even
8,9,11,81,362880,odd


In [45]:
df[["x", "odd_even"]]

Unnamed: 0,x,odd_even
0,1,odd
1,2,even
2,3,odd
3,4,even
4,5,odd
5,6,even
6,7,odd
7,8,even
8,9,odd


In [46]:
df[df["odd_even"]=="odd"]

Unnamed: 0,x,$x+2$,$x^2$,$x!$,odd_even
0,1,3,1,1,odd
2,3,5,9,6,odd
4,5,7,25,120,odd
6,7,9,49,5040,odd
8,9,11,81,362880,odd


In [47]:
# Same filter as df[df["odd_even"]=="odd"], using attribute access for the column.
# Labels that are not valid Python identifiers (e.g. "$x^2$") still need the bracket form.
df[df.odd_even == "odd"]

Unnamed: 0,x,$x+2$,$x^2$,$x!$,odd_even
0,1,3,1,1,odd
2,3,5,9,6,odd
4,5,7,25,120,odd
6,7,9,49,5040,odd
8,9,11,81,362880,odd


In [48]:
# Rows that are odd OR whose square is below 20 (element-wise | on two masks).
is_odd = df.odd_even.eq("odd")
square_below_20 = df["$x^2$"].lt(20)
df[is_odd | square_below_20]

Unnamed: 0,x,$x+2$,$x^2$,$x!$,odd_even
0,1,3,1,1,odd
1,2,4,4,2,even
2,3,5,9,6,odd
3,4,6,16,24,even
4,5,7,25,120,odd
6,7,9,49,5040,odd
8,9,11,81,362880,odd


In [49]:
# Rows that are odd AND whose square is below 20 (element-wise & on two masks).
odd_rows = df["odd_even"].eq("odd")
small_squares = df["$x^2$"].lt(20)
df[odd_rows & small_squares]

Unnamed: 0,x,$x+2$,$x^2$,$x!$,odd_even
0,1,3,1,1,odd
2,3,5,9,6,odd


In [50]:
# Pairwise scatter plots of the frame's columns, with a kernel density estimate on the diagonal.
# scatter_matrix lives in pandas.plotting; the top-level pd.scatter_matrix alias
# has been removed from pandas.
pd.plotting.scatter_matrix(df, diagonal = "kde", figsize = (10,10))

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x107e78ac8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10bab5898>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10bc02b70>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10bc24860>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x10bc8b8d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10bcd9080>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10be0d208>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10be572e8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x10be92f28>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10bee1208>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10bee7a20>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10bf63e48>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x10bfb35f8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10bfed8d0>,
   

In [51]:
df.describe()

Unnamed: 0,x,$x+2$,$x^2$,$x!$
count,9.0,9.0,9.0,9.0
mean,5.0,7.0,31.666667,45457.0
std,2.738613,2.738613,28.080242,119758.341137
min,1.0,3.0,1.0,1.0
25%,3.0,5.0,9.0,6.0
50%,5.0,7.0,25.0,120.0
75%,7.0,9.0,49.0,5040.0
max,9.0,11.0,81.0,362880.0


<h3>Reading from a CSV file</h3>

In [52]:
# Historical prices for the TASI index, downloaded as CSV straight from the URL.
# NOTE(review): this depends on a third-party endpoint — verify it is still reachable.
url = "http://www.google.com/finance/historical?q=TADAWUL:TASI&output=csv"
# The response begins with a UTF-8 byte-order mark, which otherwise ends up
# glued to the first header (the column shows as "\ufeffDate", so
# stocks_data["Date"] fails). Decoding as "utf-8-sig" strips the mark.
stocks_data = pd.read_csv(url, encoding="utf-8-sig")

In [53]:
stocks_data

Unnamed: 0,﻿Date,Open,High,Low,Close,Volume
0,28-Oct-15,7097.59,7122.68,6990.88,7118.42,234567321
1,27-Oct-15,7319.15,7319.15,7033.29,7097.59,254195186
2,26-Oct-15,7276.26,7320.23,7192.05,7319.15,190660876
3,25-Oct-15,7382.59,7433.71,7273.69,7276.26,162113724
4,22-Oct-15,7479.01,7481.08,7312.51,7382.59,215101904
5,21-Oct-15,7689.76,7690.57,7471.36,7479.01,226627038
6,20-Oct-15,7765.22,7769.84,7678.39,7689.76,201544837
7,19-Oct-15,7792.62,7793.61,7750.49,7765.22,213313899
8,18-Oct-15,7698.73,7800.10,7699.53,7792.62,222717069
9,15-Oct-15,7784.50,7784.50,7698.17,7698.73,223510840


In [54]:
# Daily move: closing price minus opening price.
stocks_data["change_amount"] = stocks_data["Close"].sub(stocks_data["Open"])
# NOTE(review): the ratio is taken relative to Close (not Open) and is stored
# as a fraction, not multiplied by 100 — confirm this is the intended definition.
stocks_data["change_percentage"] = stocks_data["change_amount"].div(stocks_data["Close"])
stocks_data

Unnamed: 0,﻿Date,Open,High,Low,Close,Volume,change_amount,change_percentage
0,28-Oct-15,7097.59,7122.68,6990.88,7118.42,234567321,20.83,0.002926
1,27-Oct-15,7319.15,7319.15,7033.29,7097.59,254195186,-221.56,-0.031216
2,26-Oct-15,7276.26,7320.23,7192.05,7319.15,190660876,42.89,0.005860
3,25-Oct-15,7382.59,7433.71,7273.69,7276.26,162113724,-106.33,-0.014613
4,22-Oct-15,7479.01,7481.08,7312.51,7382.59,215101904,-96.42,-0.013060
5,21-Oct-15,7689.76,7690.57,7471.36,7479.01,226627038,-210.75,-0.028179
6,20-Oct-15,7765.22,7769.84,7678.39,7689.76,201544837,-75.46,-0.009813
7,19-Oct-15,7792.62,7793.61,7750.49,7765.22,213313899,-27.40,-0.003529
8,18-Oct-15,7698.73,7800.10,7699.53,7792.62,222717069,93.89,0.012049
9,15-Oct-15,7784.50,7784.50,7698.17,7698.73,223510840,-85.77,-0.011141
