In [2]:
# all imports
from IPython.display import HTML
import numpy as np
import urllib2
import bs4 #this is beautiful soup
import time
import operator
import socket
import cPickle
import re # regular expressions

from pandas import Series
import pandas as pd
from pandas import DataFrame

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_context("talk")
sns.set_style("white")

from secret import *

Pandas: MovieLens Data
===================

http://grouplens.org/datasets/movielens/

Example inspired by [Greg Reda](http://www.gregreda.com/2013/10/26/using-pandas-on-the-movielens-dataset/)
### User Dataset

In [3]:
# Fetch the MovieLens 100k user table straight from the GroupLens server.

# The raw file carries values only (no header row), so the column labels
# are supplied by hand.
u_cols = [
    'user_id',
    'age',
    'sex',
    'occupation',
    'zip_code',
]

# Fields in u.user are delimited by '|'.
users_url = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.user'
users = pd.read_csv(users_url, names=u_cols, sep='|')

users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### Ratings Dataset

In [4]:
# Column labels for the ratings file, passed to read_csv via names=.
r_cols = [
    'user_id',
    'movie_id',
    'rating',
    'unix_timestamp',
]

# Fields in u.data are tab-separated.
ratings_url = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.data'
ratings = pd.read_csv(ratings_url, names=r_cols, sep='\t')

ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


### Movies Dataset

In [5]:
# The item file also contains columns indicating each movie's genres, which
# are not needed here, so only the first five columns are loaded via usecols.

m_cols = ['movie_id', 'title', 'release_date',
          'video_release_date', 'imdb_url']

# NOTE(review): ml-100k's u.item is distributed as ISO-8859-1 (latin-1) text,
# not UTF-8 -- confirm if the dataset location ever changes. Declaring the
# encoding lets pandas decode titles with accented characters instead of
# failing to decode them (Python 3) or keeping undecoded bytes (Python 2).
movies = pd.read_csv(
    'http://files.grouplens.org/datasets/movielens/ml-100k/u.item',
    sep='|', names=m_cols, usecols=range(5), encoding='latin-1')

movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995)


### Gather info about the data

In [7]:
# dtype pandas settled on for each column of the movies frame
print(movies.dtypes)

movie_id                int64
title                  object
release_date           object
video_release_date    float64
imdb_url               object
dtype: object


In [9]:
# describe() reports summary statistics per column (count, mean, std, min,
# quartiles, max); here only the numeric columns (movie_id,
# video_release_date) show up -- object columns such as title are left out
print(movies.describe())

          movie_id  video_release_date
count  1682.000000                 0.0
mean    841.500000                 NaN
std     485.695893                 NaN
min       1.000000                 NaN
25%     421.250000                 NaN
50%     841.500000                 NaN
75%    1261.750000                 NaN
max    1682.000000                 NaN


### Selecting data

* DataFrame => group of Series with shared index
* single DataFrame column => Series

In [16]:
# show the first rows of users as a reference for the selections that follow
users.head()

   user_id  age sex  occupation zip_code
0        1   24   M  technician    85711
1        2   53   F       other    94043
2        3   23   M      writer    32067
3        4   24   M  technician    43537
4        5   33   F       other    15213


In [15]:
# select a single column by its label; the result is a Series, not a DataFrame
users['occupation'].head()

0    technician
1         other
2        writer
3    technician
4         other
Name: occupation, dtype: object


In [17]:
# select several columns at once by indexing with a list of labels;
# the result is a smaller DataFrame holding just those columns
columns_you_want = ['occupation', 'sex']
print(users[columns_you_want].head())

   occupation sex
0  technician   M
1       other   F
2      writer   M
3  technician   M
4       other   F


In [18]:
# iloc[] selects by integer position: iloc[3] is the fourth row (positions
# start at 0), returned as a Series indexed by the column names
print(users.iloc[3])

user_id                4
age                   24
sex                    M
occupation    technician
zip_code           43537
Name: 3, dtype: object


### Conditional Filtering

In [19]:
# select users older than 25
# build the boolean mask first, then use it to keep only the matching rows
older_than_25 = users['age'] > 25
oldUsers = users[older_than_25]
oldUsers.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
1,2,53,F,other,94043
4,5,33,F,other,15213
5,6,42,M,executive,98101
6,7,57,M,administrator,91344
7,8,36,M,administrator,5201


In [36]:
# Q: show users age 40 and male
# Both conditions are combined into ONE boolean mask with element-wise `&`
# (each comparison needs its own parentheses because & binds tighter than ==).
# Chaining two filters, users[cond_a][cond_b], applies a mask built from the
# full frame to an already-filtered frame, which makes pandas warn that the
# boolean Series key is being reindexed to match the DataFrame index.
male40 = users[(users.age == 40) & (users.sex == 'M')]
male40.head()

  from ipykernel import kernelapp as app


Unnamed: 0,user_id,age,sex,occupation,zip_code
18,19,40,M,librarian,2138
82,83,40,M,other,44133
115,116,40,M,healthcare,97232
199,200,40,M,programmer,93402
283,284,40,M,executive,92629


In [37]:
# Q: show the mean age of female programmers
fProgrammers = users[(users.sex == 'F') & (users.occupation == 'programmer')] # bitwise &
print "Mean", fProgrammers.age.mean()

Mean 32.1666666667


### Split-apply-combine

<img src="http://i.imgur.com/yjNkiwL.png">
* split the data into groups based on some criteria
* apply a function to each group independently
* combine the results into a data structure

### Find Diligent Users

* split data per user ID
* count ratings
* combine result

In [41]:
print(ratings.head())

# split: bucket the movie_id values by the user who rated them
user_keys = ratings['user_id']
grouped_data = ratings['movie_id'].groupby(user_keys)

# Note: grouping the whole frame instead, "ratings.groupby('user_id')", and
# calling count() would produce a count for every remaining column of the df,
# not just for movie_id

# apply + combine: number of rated movies per user
ratings_per_user = grouped_data.count()

ratings_per_user.head(5)

   user_id  movie_id  rating  unix_timestamp
0      196       242       3       881250949
1      186       302       3       891717742
2       22       377       1       878887116
3      244        51       2       880606923
4      166       346       1       886397596


user_id
1    272
2     62
3     54
4     24
5    175
Name: movie_id, dtype: int64

The output isn't formatted as nicely as before. Why?
Because we're now dealing with a Series, not an entire DataFrame.

In [46]:
# QUIZ: get the average rating per movie

# split: collect the rating values that belong to each movie_id
movie_keys = ratings['movie_id']
grouped_data = ratings['rating'].groupby(movie_keys)

# apply + combine: mean rating of every group
avg_per_movie = grouped_data.mean()

print("Average Ratings")
avg_per_movie.head()

Average Ratings


movie_id
1    3.878319
2    3.206107
3    3.033333
4    3.550239
5    3.302326
Name: rating, dtype: float64

In [53]:
# QUIZ: advanced: get the movie titles with the highest average rating

# Sorting and taking the first entry is not enough: several titles tie for
# the top average, so keep every movie whose mean equals the maximum.

max_rating = avg_per_movie.max()
is_top_rated = avg_per_movie == max_rating
good_movies = avg_per_movie[is_top_rated]
print(good_movies)

movie_id
814     5.0
1122    5.0
1189    5.0
1201    5.0
1293    5.0
1467    5.0
1500    5.0
1536    5.0
1599    5.0
1653    5.0
Name: rating, dtype: float64


In [61]:
# Problem: the per-movie averages were computed from the ratings frame only,
# so the movie titles are not part of that result.
# Fix: take the movie_ids and look them up in the movies df (i.e. filter).

# good_movies is a Series, so the ids live in its index
# ("good_movies.movie_id" would be wrong: there is no such column)
good_movie_ids = good_movies.index

in_top_set = movies['movie_id'].isin(good_movie_ids)
result = movies[in_top_set]['title']

print(result)

813                         Great Day in Harlem, A (1994)
1121                       They Made Me a Criminal (1939)
1188                                   Prefontaine (1997)
1200           Marlene Dietrich: Shadow and Light (1996) 
1292                                      Star Kid (1997)
1466                 Saint of Fort Washington, The (1993)
1499                            Santa with Muscles (1996)
1535                                 Aiqing wansui (1994)
1598                        Someone Else's America (1995)
1652    Entertaining Angels: The Dorothy Day Story (1996)
Name: title, dtype: object


In [79]:
# Intuition: could the high averages simply come from very few people
# having rated these titles in the first place?

# number of ratings in each group of grouped_data
how_many_ratings = grouped_data.count()

print("Number of ratings per movie")
top_rated_mask = avg_per_movie == max_rating
print(how_many_ratings[top_rated_mask])

Number of ratings per movie
movie_id
814     1
1122    1
1189    3
1201    1
1293    3
1467    2
1500    2
1536    1
1599    1
1653    1
Name: rating, dtype: int64


### Can pass a custom function to apply to groupby object 

In [65]:
def group_mean(f):
    """Return the mean of the values of a single group."""
    return f.mean()

# apply() calls the custom function once per group and combines the results
avg_per_movie = grouped_data.apply(group_mean)
avg_per_movie.head()

movie_id
1    3.878319
2    3.206107
3    3.033333
4    3.550239
5    3.302326
Name: rating, dtype: float64

In [77]:
# QUIZ: advanced: list all occupations and if they are male or female dominant

# Group M/F by Occupation
grouped_occs = users['sex'].groupby(users['occupation'])

# Apply function that compares count of males to females
m_dominant = grouped_occs.apply(lambda f: sum(f == 'M') > sum(f == 'F')) 

# f arg is 'sex', what we split the df into. 
# f is NOT an object containing occupations and sexes. 
# f is NOT the index i.e. occupation by which data was grouped

print "Male Dominant:\n", m_dominant

Male Dominant:
occupation
administrator     True
artist            True
doctor            True
educator          True
engineer          True
entertainment     True
executive         True
healthcare       False
homemaker        False
lawyer            True
librarian        False
marketing         True
none              True
other             True
programmer        True
retired           True
salesman          True
scientist         True
student           True
technician        True
writer            True
Name: sex, dtype: bool


In [78]:
# Intuition: Could the result be skewed by having more users of a particular sex report any occupation?
# Count each sex with the vectorized Series.sum() on the boolean mask
# (True counts as 1) rather than iterating through the builtin sum().
n_male = (users['sex'] == 'M').sum()
n_female = (users['sex'] == 'F').sum()

print('number of male users: ')
print(n_male)

print('number of female users: ')
print(n_female)

number of male users: 
670
number of female users: 
273


Python data scraping
====================