In [31]:
from jupyterthemes import jtplot
# Apply jupyterthemes' plot styling with its default settings.
# NOTE(review): presumably this restyles later matplotlib figures to match the
# notebook theme - confirm against the jupyterthemes documentation.
jtplot.style()

# Practical Data Transformation and Analysis with Pandas
### Zong-han, Xie

# Speaker
* Zong-han, Xie
* Majored in physics
* Previously a C++ developer developing simulation software for LCD.
* Currently working for Micron Memory Taiwan, building home-made B.I. system.
* Email: icbm0926@gmail.com

# This talk is intended for
* People who have needs to perform complex data ETL with Python

# Outline
* Basic Data Structures: 
  - Create Pandas Series and DataFrame
  - Read our demo data
  - Indexing in DataFrame and Pandas Series
  - SettingsWithCopy Warning
* Text Handling with Pandas
  - Using "str" attributes to handle string and using regexp with it
* Merging and Concatenating tables
  - Concept of merging two tables (inner join, left/right join, outer join)
  - Concatenating tables
* Split-Apply-Combine strategy
  - Process Flow
  - GroupBy object
  - GroupBy.transform
  - GroupBy.apply
  - GroupBy.aggregate
* A small example

In [1]:
# %load Extract_MovieLens_Data.py


# # This notebook is to extract data from Movie Lens
# * The data contents are explained in http://files.grouplens.org/papers/ml-1m-README.txt
# 
# ## users.dat
# 
# UserID::Gender::Age::Occupation::Zip-code
# - Gender is denoted by a "M" for male and "F" for female
# - Age is chosen from the following ranges:
# 
# 	*  1:  "Under 18"
# 	* 18:  "18-24"
# 	* 25:  "25-34"
# 	* 35:  "35-44"
# 	* 45:  "45-49"
# 	* 50:  "50-55"
# 	* 56:  "56+"
# 
# - Occupation is chosen from the following choices:
# 
# 	*  0:  "other" or not specified
# 	*  1:  "academic/educator"
# 	*  2:  "artist"
# 	*  3:  "clerical/admin"
# 	*  4:  "college/grad student"
# 	*  5:  "customer service"
# 	*  6:  "doctor/health care"
# 	*  7:  "executive/managerial"
# 	*  8:  "farmer"
# 	*  9:  "homemaker"
# 	* 10:  "K-12 student"
# 	* 11:  "lawyer"
# 	* 12:  "programmer"
# 	* 13:  "retired"
# 	* 14:  "sales/marketing"
# 	* 15:  "scientist"
# 	* 16:  "self-employed"
# 	* 17:  "technician/engineer"
# 	* 18:  "tradesman/craftsman"
# 	* 19:  "unemployed"
# 	* 20:  "writer"
# 
# ## movies.dat
# MovieID::Title::Genres
# 
# ## ratings.dat
# UserID::MovieID::Rating::Timestamp

# In[1]:

import pandas as pd


# In[3]:

# Load the user table. '::' is a multi-character separator, which pandas can
# only parse with its Python engine; asking for that engine explicitly avoids
# the ParserWarning raised when pandas has to fall back from the C engine.
users_df = pd.read_csv(
    "./ml-1m/users.dat",
    sep='::',
    engine='python',
    header=None,
    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],
)

# Lookup table from the numeric occupation code (0-20) to its readable name.
# The position of each name in the list is its code.
ocupation_codes = pd.DataFrame({
    'ocupation_code': list(range(21)),
    'Occupation_name': [
        "other or not specified", "academic/educator", "artist",
        "clerical/admin", "college/grad student", "customer service",
        "doctor/health care", "executive/managerial", "farmer",
        "homemaker", "K-12 student", "lawyer", "programmer", "retired",
        "sales/marketing", "scientist", "self-employed", "technician/engineer",
        "tradesman/craftsman", "unemployed", "writer",
    ],
})

# Replace the numeric code with the occupation name: left-join on the code so
# every user row is kept, drop both code columns, then let the name column take
# over the "Occupation" label.
users_df = (
    users_df
    .merge(ocupation_codes, left_on=["Occupation"], right_on=["ocupation_code"], how='left')
    .drop(["Occupation", "ocupation_code"], axis=1)
    .rename(columns={'Occupation_name': 'Occupation'})
)


# In[4]:

# Load the movie table. '::' is a multi-character separator, so the Python
# parsing engine is requested explicitly (the C engine cannot handle it and
# pandas would otherwise warn about the fallback).
# NOTE(review): if this file contains non-UTF-8 characters, an explicit
# `encoding=` argument may be needed - confirm against the dataset.
movies_org_df = pd.read_csv(
    "./ml-1m/movies.dat",
    sep='::',
    engine='python',
    header=None,
    names=["MovieID", "Title", "Genres"],
)

# Expand the pipe-delimited Genres field into one row per (movie, genre) pair.
# Walking the three columns in lockstep avoids building a Series for every row,
# which is what DataFrame.iterrows() does.
rows = [
    [movie_id, title, gen]
    for movie_id, title, genres in zip(
        movies_org_df['MovieID'], movies_org_df['Title'], movies_org_df['Genres']
    )
    for gen in genres.split('|')
]
movies_df = pd.DataFrame(rows, columns=movies_org_df.columns)


# In[5]:

# Load the ratings table. '::' is a multi-character separator, so the Python
# parsing engine is requested explicitly (the C engine cannot handle it and
# pandas would otherwise warn about the fallback).
ratings_df = pd.read_csv(
    "./ml-1m/ratings.dat",
    sep='::',
    engine='python',
    header=None,
    names=["UserID", "MovieID", "Rating", "Timestamp"],
)

# Timestamp is interpreted as a count of seconds (unit='s') and converted to a
# proper datetime column, kept alongside the raw value.
ratings_df['rating_dt'] = pd.to_datetime(ratings_df['Timestamp'], unit='s')





In [1]:
import numpy as np

# Show the column labels of the three source tables, one Index per line.
print(users_df.columns, movies_df.columns, ratings_df.columns, sep='\n')

NameError: name 'users_df' is not defined

In [3]:
# Build one denormalised table: every rating enriched with the rater's profile
# and the movie's title/genre. Left joins keep every rating row. Because
# movies_df holds one row per (movie, genre), a rating is repeated once for
# each genre of the rated movie.
all_df = (
    ratings_df
    .merge(users_df, on=['UserID'], how='left')
    .merge(movies_df, on=['MovieID'], how='left')
)

# DataFrame Indexing

In [22]:
# Summary statistics (count, mean, std, min, quartiles, max) of the UserID
# column across the merged table.
all_df.UserID.describe()


count    2.101815e+06
mean     3.026197e+03
std      1.731014e+03
min      1.000000e+00
25%      1.505000e+03
50%      3.075000e+03
75%      4.478000e+03
max      6.040000e+03
Name: UserID, dtype: float64

In [4]:
# Select three users by index label. `.loc` is the label-based indexer; the
# older `.ix` indexer is deprecated and absent from current pandas releases.
# The explicit copy makes it clear that the assignment below edits only this
# subset and never users_df.
part_users_df = users_df.loc[[2, 4, 8]].copy()

# Label-based scalar assignment: row label 2, column 'Gender'.
part_users_df.loc[2, 'Gender'] = 'F'
print(part_users_df)

   UserID Gender  Age Zip-code           Occupation
2       3      F   25    55117            scientist
4       5      M   25    55455               writer
8       9      M   25    61614  technician/engineer


In [5]:
# Display the frame indexed by UserID. The result is not assigned, so
# part_users_df itself keeps its current index.
part_users_df.set_index("UserID")

Unnamed: 0_level_0,Gender,Age,Zip-code,Occupation
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,F,25,55117,scientist
5,M,25,55455,writer
9,M,25,61614,technician/engineer


# Text Handling

In [6]:
# Label-based slice of the original (one row per movie) table. With `.loc`
# both end points are included, exactly as with the deprecated `.ix` on an
# integer index.
movies_org_df.loc[0:10]

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [7]:
# Keep only the rows whose title matches the regular expression '^Old.*',
# i.e. titles that begin with "Old".
movies_df.loc[movies_df['Title'].str.contains('^Old.*')]

Unnamed: 0,MovieID,Title,Genres
1309,797,"Old Lady Who Walked in the Sea, The (Vieille q...",Comedy
1657,1012,Old Yeller (1957),Children's
1658,1012,Old Yeller (1957),Drama
1791,1085,"Old Man and the Sea, The (1958)",Adventure
1792,1085,"Old Man and the Sea, The (1958)",Drama


# Merge Data and Concat Data
* Pandas.merge and Pandas.concat
* Pandas.merger is analog to join in SQL

# Split - Apply - Combine Strategy
The basic cooncept of split-apply-combine strategy 
* Split
    - Split original data into groups
* Apply
    - Apply functions to data within each group independently
* Combine
    - Merge results into a data structure

# Data Aggregation

```
select 
max(Ratings.Rating)
from Ratings
group by Ratings.MovieId
```

```
df.assign(max_ratings = ratings_df.groupby("MovieID")['Rating'].transform(np.max))
```

* Data aggregation in Pandas uses GroupBy.apply, GroupBy.transform and GroupBy.aggregate.
* These functions are badly documented in Pandas dicumentation.

# GroupBy objects

In [8]:
# Calling groupby on its own returns a GroupBy object; displaying it shows only
# the object's repr, not any grouped data.
all_df.groupby(["Occupation", "Genres"])

<pandas.core.groupby.DataFrameGroupBy object at 0x7fde459c29b0>

In [9]:
# Iterating a GroupBy object yields (key, sub-frame) pairs. Only the first pair
# is inspected here, hence the immediate break.
grouped_ratings = all_df.groupby(["Occupation", "Genres"])
for key, group_df in grouped_ratings:
    print("group keys: ", key, sep="")
    # First five rows of this group's sub-frame.
    print(group_df.head(5))
    break

group keys: ('K-12 student', 'Action')
     UserID  MovieID  Rating  Timestamp           rating_dt Gender  Age  \
10        1     1197       3  978302268 2000-12-31 22:37:48      F    1   
14        1     1287       5  978302039 2000-12-31 22:33:59      F    1   
90        1     2692       4  978301570 2000-12-31 22:26:10      F    1   
93        1      260       4  978300760 2000-12-31 22:12:40      F    1   
104       1     2028       5  978301619 2000-12-31 22:26:59      F    1   

    Zip-code    Occupation                                      Title  Genres  
10     48067  K-12 student                 Princess Bride, The (1987)  Action  
14     48067  K-12 student                             Ben-Hur (1959)  Action  
90     48067  K-12 student           Run Lola Run (Lola rennt) (1998)  Action  
93     48067  K-12 student  Star Wars: Episode IV - A New Hope (1977)  Action  
104    48067  K-12 student                 Saving Private Ryan (1998)  Action  


# Let's manually aggregate function

In [10]:
# Hand-rolled aggregation: walk over every (Occupation, Genres) group and
# collect its mean rating, to show what a groupby aggregation does internally.
results = {'Occupation': [], 'Genres': [], 'Rating_mean': []}
grouped_ratings = all_df.groupby(["Occupation", "Genres"])
for key, group_df in grouped_ratings:
    # key is the (Occupation, Genres) tuple identifying the current group.
    results['Occupation'].append(key[0])
    results['Genres'].append(key[1])
    results['Rating_mean'].append(group_df.Rating.mean())

# Label-based slice (both end points included). `.loc` replaces the deprecated
# `.ix` indexer.
pd.DataFrame(results).loc[0:10]

Unnamed: 0,Genres,Occupation,Rating_mean
0,Action,K-12 student,3.497116
1,Adventure,K-12 student,3.425658
2,Animation,K-12 student,3.463956
3,Children's,K-12 student,3.220679
4,Comedy,K-12 student,3.4972
5,Crime,K-12 student,3.687085
6,Documentary,K-12 student,3.581633
7,Drama,K-12 student,3.782167
8,Fantasy,K-12 student,3.298039
9,Film-Noir,K-12 student,4.212766


In [30]:
import numpy as np

# Work on an explicit copy of the K-12 student rows so that adding a column
# below leaves all_df untouched.
tmp = all_df[all_df.Occupation == 'K-12 student'].copy()

# transform() broadcasts each group's mean rating back onto every row of that
# group, so the result lines up with tmp's own index. The string alias 'mean'
# selects pandas' built-in group mean; newer pandas releases warn when a NumPy
# reducer such as np.mean is passed instead.
tmp.loc[:, 'Rating_mean'] = tmp.groupby(["Occupation", "Genres"])['Rating'].transform('mean')

# Print rows 1-9 (positional) of the genre-sorted result.
print(tmp[['Occupation', 'Genres', 'Rating_mean']].sort_values(by='Genres').iloc[1:10])

           Occupation  Genres  Rating_mean
804564   K-12 student  Action     3.497116
363454   K-12 student  Action     3.497116
1806560  K-12 student  Action     3.497116
772492   K-12 student  Action     3.497116
1806556  K-12 student  Action     3.497116
659674   K-12 student  Action     3.497116
706974   K-12 student  Action     3.497116
827117   K-12 student  Action     3.497116
363460   K-12 student  Action     3.497116


** GroupBy.transform() returns a Pandas Series with the same index as those in original DataFrame **

** Therefore, it's easy to combine data back to the original data. **

In [12]:
# Collapse the broadcast per-row means back to one row per (Occupation, Genres)
# combination and print the first ten.
print(
    tmp.loc[:, ['Occupation', 'Genres', 'Rating_mean']]
    .sort_values(by='Genres')
    .drop_duplicates()
    .head(10)
)

           Occupation       Genres  Rating_mean
418625   K-12 student       Action     3.497116
831363   K-12 student    Adventure     3.425658
136334   K-12 student    Animation     3.463956
453973   K-12 student   Children's     3.220679
789324   K-12 student       Comedy     3.497200
692103   K-12 student        Crime     3.687085
1931149  K-12 student  Documentary     3.581633
790387   K-12 student        Drama     3.782167
521989   K-12 student      Fantasy     3.298039
364059   K-12 student    Film-Noir     4.212766


In [13]:
# Built-in aggregation: one mean rating per (Occupation, Genres) group, returned
# as a Series indexed by the two group keys.
(
    all_df
    .groupby(['Occupation', 'Genres'])['Rating']
    .mean()
)

Occupation         Genres     
K-12 student       Action         3.497116
                   Adventure      3.425658
                   Animation      3.463956
                   Children's     3.220679
                   Comedy         3.497200
                   Crime          3.687085
                   Documentary    3.581633
                   Drama          3.782167
                   Fantasy        3.298039
                   Film-Noir      4.212766
                   Horror         3.237795
                   Musical        3.556738
                   Mystery        3.636612
                   Romance        3.624415
                   Sci-Fi         3.443795
                   Thriller       3.554131
                   War            3.880144
                   Western        3.513333
academic/educator  Action         3.392063
                   Adventure      3.424278
                   Animation      3.693399
                   Children's     3.459286
                   Come

In [14]:
# Same aggregation through the generic agg() entry point, then select the
# 'K-12 student' slice of the first index level with the label-based `.loc`
# (the deprecated `.ix` indexer is absent from current pandas releases).
# The string alias 'mean' selects pandas' built-in group mean; newer pandas
# releases warn when a NumPy reducer such as np.mean is passed instead.
all_df.groupby(['Occupation', 'Genres'])['Rating'].agg('mean').loc['K-12 student']

Genres
Action         3.497116
Adventure      3.425658
Animation      3.463956
Children's     3.220679
Comedy         3.497200
Crime          3.687085
Documentary    3.581633
Drama          3.782167
Fantasy        3.298039
Film-Noir      4.212766
Horror         3.237795
Musical        3.556738
Mystery        3.636612
Romance        3.624415
Sci-Fi         3.443795
Thriller       3.554131
War            3.880144
Western        3.513333
Name: Rating, dtype: float64

In [15]:
# Aggregate, then turn the (Occupation, Genres) index levels back into ordinary
# columns with reset_index(). `.loc[0:5]` is a label-based slice that includes
# both end points, and replaces the deprecated `.ix` indexer. The string alias
# 'mean' is used because newer pandas releases warn when np.mean is passed.
(
    all_df
    .groupby(['Occupation', 'Genres'])['Rating']
    .agg('mean')
    .reset_index()
    .loc[0:5]
)

Unnamed: 0,Occupation,Genres,Rating
0,K-12 student,Action,3.497116
1,K-12 student,Adventure,3.425658
2,K-12 student,Animation,3.463956
3,K-12 student,Children's,3.220679
4,K-12 student,Comedy,3.4972
5,K-12 student,Crime,3.687085


In [27]:
# Dict form of agg(): map each column to the aggregation applied to it. Here
# only Rating is aggregated, so the result has a single value column.
# `.loc[0:5]` is a label-based slice that includes both end points, and replaces
# the deprecated `.ix` indexer. The string alias 'mean' is used because newer
# pandas releases warn when np.mean is passed.
(
    all_df
    .groupby(['Occupation', 'Genres'])
    .agg({'Rating': 'mean'})
    .reset_index()
    .loc[0:5]
)

Unnamed: 0,Occupation,Genres,Rating
0,K-12 student,Action,3.497116
1,K-12 student,Adventure,3.425658
2,K-12 student,Animation,3.463956
3,K-12 student,Children's,3.220679
4,K-12 student,Comedy,3.4972
5,K-12 student,Crime,3.687085
