In [1]:
import pandas as pd
import numpy as np
from numpy.random import randn

### Constructing DataFrame

In [2]:
# Method 1
d = {'a': [1,2,3,4], 'b': [5,6,7,8]}
df = pd.DataFrame(data = d)

In [3]:
df

Unnamed: 0,a,b
0,1,5
1,2,6
2,3,7
3,4,8


In [4]:
df.dtypes

a    int64
b    int64
dtype: object

In [5]:
# Method 2 --- from numpy ndarray
df2 = pd.DataFrame(np.random.randint(low = 0, high = 10, size = (5,5)),
                  columns = ['a', 'b', 'c', 'd', 'e'])
df2

Unnamed: 0,a,b,c,d,e
0,1,9,0,1,9
1,5,9,0,9,0
2,0,1,5,2,8
3,8,1,7,8,2
4,1,4,3,7,2


### Pandas Dataframe VS. Pandas Series

Seires is the datastructure for a single column of a DataFrame

In [6]:
# DataFrame
df2[['a']]

Unnamed: 0,a
0,1
1,5
2,0
3,8
4,1


In [7]:
# Select Multiple columns as DataFrame
df2[['a', 'b']]

Unnamed: 0,a,b
0,1,9
1,5,9
2,0,1
3,8,1
4,1,4


In [8]:
# Series
df2['a']

0    1
1    5
2    0
3    8
4    1
Name: a, dtype: int64

### Selection & Indexing

In [9]:
df

Unnamed: 0,a,b
0,1,5
1,2,6
2,3,7
3,4,8


In [10]:
df['a']

0    1
1    2
2    3
3    4
Name: a, dtype: int64

In [11]:
df[['a', 'b']]

Unnamed: 0,a,b
0,1,5
1,2,6
2,3,7
3,4,8


### Creat New Column

In [12]:
df['new'] = df['a'] * df['b']

In [13]:
df

Unnamed: 0,a,b,new
0,1,5,5
1,2,6,12
2,3,7,21
3,4,8,32


### Removing Columns

In [14]:
df

Unnamed: 0,a,b,new
0,1,5,5
1,2,6,12
2,3,7,21
3,4,8,32


In [15]:
df.drop('b', axis = 1) # axis = 1 for col, axis = 0 for row

Unnamed: 0,a,new
0,1,5
1,2,12
2,3,21
3,4,32


In [16]:
df.drop(2,axis=0)

Unnamed: 0,a,b,new
0,1,5,5
1,2,6,12
3,4,8,32


### Selecting Rows

**loc** gets rows (or columns) with particular **labels** from the index. <br/>
**iloc** gets rows (or columns) at particular **positions** in the index (so it only takes integers). <br/>

###### loc

In [17]:
df.loc[3]

a       4
b       8
new    32
Name: 3, dtype: int64

In [18]:
df3 = pd.DataFrame(randn(5,4),index='A B C D E'.split(),columns='W X Y Z'.split())

In [19]:
df3

Unnamed: 0,W,X,Y,Z
A,0.96708,1.679661,0.575509,1.543603
B,0.455325,-1.615298,-1.049094,-1.029619
C,0.090128,1.492557,-1.222476,-0.786785
D,-0.089867,1.298835,0.113604,0.599789
E,0.369906,-0.074022,-0.084495,-1.90835


In [20]:
df3.loc['D']

W   -0.089867
X    1.298835
Y    0.113604
Z    0.599789
Name: D, dtype: float64

In [21]:
df3.loc['E', 'Y']

-0.08449523694827309

In [22]:
df3.loc[df3['X' ]>1]

Unnamed: 0,W,X,Y,Z
A,0.96708,1.679661,0.575509,1.543603
C,0.090128,1.492557,-1.222476,-0.786785
D,-0.089867,1.298835,0.113604,0.599789


In [23]:
df3.loc[df3['X' ]>1, ['X', 'Y']]

Unnamed: 0,X,Y
A,1.679661,0.575509
C,1.492557,-1.222476
D,1.298835,0.113604


In [24]:
df3.loc[df3['X'] > 1, 'X'] # returns a series

A    1.679661
C    1.492557
D    1.298835
Name: X, dtype: float64

In [25]:
df3.loc[df3['X'] > 1, ['X']] # returns a dataframe

Unnamed: 0,X
A,1.679661
C,1.492557
D,1.298835


##### iloc

In [26]:
df3.iloc[2]

W    0.090128
X    1.492557
Y   -1.222476
Z   -0.786785
Name: C, dtype: float64

In [27]:
df3.iloc[2,3]

-0.7867847554594302

In [28]:
df3.iloc[0:3] # first 3 rows of dataframe

Unnamed: 0,W,X,Y,Z
A,0.96708,1.679661,0.575509,1.543603
B,0.455325,-1.615298,-1.049094,-1.029619
C,0.090128,1.492557,-1.222476,-0.786785


In [29]:
df3.iloc[:, 0:2] # first two columns of data frame with all rows

Unnamed: 0,W,X
A,0.96708,1.679661
B,0.455325,-1.615298
C,0.090128,1.492557
D,-0.089867,1.298835
E,0.369906,-0.074022


In [30]:
df3.iloc[[0,3,4], [1,2]] # 1st, 4th, 5th row with 2nd and 3rd columns

Unnamed: 0,X,Y
A,1.679661,0.575509
D,1.298835,0.113604
E,-0.074022,-0.084495


In [31]:
df3.iloc[0:3, 2:4] # first 3 rows, and 3rd to 5th columns

Unnamed: 0,Y,Z
A,0.575509,1.543603
B,-1.049094,-1.029619
C,-1.222476,-0.786785


###### .ix

Note: The ix indexer has been deprecated in recent versions of Pandas, starting with version 0.20.1.

###### Question-> How to check the version of your package?

In [32]:
pd.__version__

'0.23.4'

### Conditional Selection

In [33]:
df3

Unnamed: 0,W,X,Y,Z
A,0.96708,1.679661,0.575509,1.543603
B,0.455325,-1.615298,-1.049094,-1.029619
C,0.090128,1.492557,-1.222476,-0.786785
D,-0.089867,1.298835,0.113604,0.599789
E,0.369906,-0.074022,-0.084495,-1.90835


In [34]:
df3 < 0

Unnamed: 0,W,X,Y,Z
A,False,False,False,False
B,False,True,True,True
C,False,False,True,True
D,True,False,False,False
E,False,True,True,True


In [35]:
df3[df3<0]

Unnamed: 0,W,X,Y,Z
A,,,,
B,,-1.615298,-1.049094,-1.029619
C,,,-1.222476,-0.786785
D,-0.089867,,,
E,,-0.074022,-0.084495,-1.90835


In [36]:
df3[df3<0]['W']

A         NaN
B         NaN
C         NaN
D   -0.089867
E         NaN
Name: W, dtype: float64

### Summarising, Aggregating and Groupying data in pandas

In [39]:
cust = pd.read_csv("../data/customers.csv")

In [40]:
cust.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


### unique, nunique

In [41]:
cust['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [42]:
cust['Gender'].nunique()

2

### Groupby

In [43]:
cust.groupby(['Spending Score (1-100)']).groups.keys()

dict_keys([1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, 24, 26, 27, 28, 29, 31, 32, 34, 35, 36, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 63, 65, 66, 68, 69, 71, 72, 73, 74, 75, 76, 77, 78, 79, 81, 82, 83, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 97, 98, 99])

In [44]:
# return the index of the dataframe that the Spending Score (1-100) equals to 55.
cust.groupby(['Spending Score (1-100)']).groups[55]

Int64Index([46, 61, 70, 81, 87, 90, 103], dtype='int64')

In [45]:
# Let's check the index, which is equal to 46
cust[cust.index == 46]

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
46,47,Female,50,40,55


In [46]:
cust[cust.index == 61]

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
61,62,Male,19,46,55


In [47]:
# The total number of people who gives Spending Score with 55
len(cust.groupby(['Spending Score (1-100)']).groups[55])

7

In [48]:
len(cust.groupby(['Spending Score (1-100)']).groups[15])

3

In [49]:
# Give the multiple condictions.
# This gives, customer who gives the Spending Score equals to 39, then group those people based on there gender, then calculate these two genders average Annual Income
cust[cust['Spending Score (1-100)'] == 39].groupby('Gender')['Annual Income (k$)'].mean()

Gender
Female    99
Male      15
Name: Annual Income (k$), dtype: int64

###### Question? How do we get the mean of annual income of customers based on different genders (Female and Male) <br/>

###### Answer:

In [50]:
cust.groupby("Gender")['Annual Income (k$)'].mean()

Gender
Female    59.250000
Male      62.227273
Name: Annual Income (k$), dtype: float64

In [51]:
cust.groupby(['Gender', 'Age'])['Annual Income (k$)'].mean()

Gender  Age
Female  18      65.000000
        19      64.000000
        20      26.500000
        21      44.750000
        22      37.000000
        23      41.500000
        24      29.500000
        25      72.000000
        27      56.000000
        28      76.000000
        29      72.500000
        30      59.400000
        31      51.714286
        32      76.833333
        33      86.000000
        34      79.666667
        35      45.833333
        36      78.000000
        37      97.000000
        38      80.500000
        40      52.666667
        41     101.000000
        42      34.000000
        43      48.000000
        44      75.500000
        45      69.333333
        46      39.500000
        47      76.750000
        49      49.166667
        50      52.000000
                  ...    
Male    33      77.500000
        34      78.000000
        35      48.333333
        36      87.000000
        37      49.000000
        38      62.500000
        39      72.666667


In [52]:
cust.groupby("Age").agg({"Annual Income (k$)": "mean"})

Unnamed: 0_level_0,Annual Income (k$)
Age,Unnamed: 1_level_1
18,51.25
19,57.0
20,41.6
21,38.8
22,31.333333
23,41.5
24,39.25
25,57.666667
26,58.0
27,63.166667


In [53]:
cust.groupby("Age").agg({"Annual Income (k$)": ['mean', min, max]})

Unnamed: 0_level_0,Annual Income (k$),Annual Income (k$),Annual Income (k$)
Unnamed: 0_level_1,mean,min,max
Age,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
18,51.25,33,65
19,57.0,15,81
20,41.6,16,73
21,38.8,15,62
22,31.333333,17,57
23,41.5,16,70
24,39.25,20,60
25,57.666667,24,77
26,58.0,54,62
27,63.166667,40,88


### count & value_counts

In [54]:
# count returns total number of values
cust['Gender'].count()

200

In [55]:
# value_counts return the count of unique items of this column
cust['Gender'].value_counts()

Female    112
Male       88
Name: Gender, dtype: int64

### merge & join

In [62]:
device = pd.read_csv("../data/user_device.csv")

In [64]:
usage = pd.read_csv("../data/user_usage.csv")

In [65]:
device.head(3)

Unnamed: 0,use_id,user_id,platform,platform_version,device,use_type_id
0,22782,26980,ios,10.2,"iPhone7,2",2
1,22783,29628,android,6.0,Nexus 5,3
2,22784,28473,android,5.1,SM-G903F,1


In [66]:
usage.head(3)

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id
0,21.97,4.82,1557.33,22787
1,1710.08,136.88,7267.55,22788
2,1710.08,136.88,7267.55,22789


In [67]:
# by default, the merge is going to be using inner join
result = pd.merge(usage, device, on = 'use_id')

In [68]:
result.head(3)

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,user_id,platform,platform_version,device,use_type_id
0,21.97,4.82,1557.33,22787,12921,android,4.3,GT-I9505,1
1,1710.08,136.88,7267.55,22788,28714,android,6.0,SM-G930F,1
2,1710.08,136.88,7267.55,22789,28714,android,6.0,SM-G930F,1


In [69]:
# inner, left, and right merge types
usage['use_id'].isin(device['use_id']).value_counts()

True     159
False     81
Name: use_id, dtype: int64

In [70]:
device['use_id'].isin(usage['use_id']).value_counts()

True     159
False    113
Name: use_id, dtype: int64

In [71]:
# Question-> how many rows our result should have?
result.shape

(159, 9)

In [72]:

left_result = pd.merge(usage, device, on = 'use_id', how = "left")

In [73]:
# Question -> how many rows we have now?
left_result.shape

(240, 9)

In [74]:
right_result = pd.merge(usage, device, on = 'use_id', how = "right")

In [75]:
# Question -> how many rows we have?
right_result.shape

(272, 9)

In [76]:
# Outer Join
outer_result = pd.merge(usage, device, on = 'use_id', how = "outer")

In [77]:
# Question -> how many rows we have now?
outer_result.shape

(353, 9)

In [78]:
result.head(3)

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,user_id,platform,platform_version,device,use_type_id
0,21.97,4.82,1557.33,22787,12921,android,4.3,GT-I9505,1
1,1710.08,136.88,7267.55,22788,28714,android,6.0,SM-G930F,1
2,1710.08,136.88,7267.55,22789,28714,android,6.0,SM-G930F,1


In [79]:
# We can also do multiple aggregation
result.groupby("platform").agg({
    "outgoing_mins_per_month": "mean",
    "monthly_mb": "mean",
    "use_id":"count"
})

Unnamed: 0_level_0,outgoing_mins_per_month,monthly_mb,use_id
platform,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
android,201.258535,4221.387834,157
ios,366.06,961.155,2


### Read & Write Files

In [80]:
# Case Example
# Read data from corpus folder:
# Aim: get the name and sentence for each file
import os
from bs4 import BeautifulSoup

In [81]:
name = []
sentence = []

You can never be sure what encoding a file is really using.

For example, a file with the first three bytes 0xEF,0xBB,0xBF is probably a UTF-8 encoded file. However, it might be an ISO-8859-1 file which happens to start with the characters ï»¿. Or it might be a different file type entirely.

In [83]:
for filename in os.listdir("../data/corpus"):
    with open("../data/corpus/" + filename, encoding = "ISO-8859-1") as fp:
        soup = BeautifulSoup(fp, 'xml')
    name_text = soup.find_all("name")
    name.append(name_text)
    
    sentence_text = soup.find_all("sentence")
    sentence.append(sentence_text)

In [84]:
data = pd.DataFrame({'name': name, 'sentence': sentence})

In [86]:
# Save the file to csv file
data.to_csv("hi_sentence.csv")

In [87]:
# Read this file
pd.read_csv("hi_sentence.csv")

Unnamed: 0.1,Unnamed: 0,name,sentence
0,0,"[<name>Australian Liquor, Hospitality and Misc...","[<sentence id=""s0"">\n The nature of the procee..."
1,1,[<name>WorldAudio Limited v Australian Communi...,"[<sentence id=""s0"">\n The circumstances of the..."
2,2,[<name>Commissioner of Taxation v Hornibrook [...,"[<sentence id=""s0"">\n EDMONDS J: \n \n INTRODU..."
3,3,[<name>SZAIX v Minister for Immigration &amp; ...,"[<sentence id=""s0"">\n Introduction \n \n1 This..."
4,4,[<name>Sharman Networks Ltd v Universal Music ...,"[<sentence id=""s0"">\n Background to the curren..."
5,5,[<name>QAAI v Minister for Immigration and Mul...,"[<sentence id=""s0"">\n BACKGROUND \n The Backgr..."
6,6,"[<name>Communications, Electrical, Electronic,...","[<sentence id=""s0"">\n \n1 On 18 November 2005 ..."
7,7,[<name>Comcare v Christina Foster [2006] FCA 6...,"[<sentence id=""s0"">\n THE APPLICATION \n \n 1 ..."


## Multiindex / Advanced Indexing
## Assignments

- Reading Material: https://pandas.pydata.org/pandas-docs/stable/advanced.html
- Data Wrangling: deadline this sunday (Feb 10 2019)