### Working with DataFrames

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Show up to 50 columns when displaying wide DataFrames.
# The fully qualified option name is used because the bare 'max_columns'
# pattern can match more than one option key in recent pandas releases,
# which pandas rejects as ambiguous.
pd.set_option('display.max_columns', 50)
%matplotlib inline

In [2]:
# Load the ml-100k user file, passing in column names for the csv.
# The data directory can be overridden with the ML100K_DIR environment
# variable so the notebook is not tied to one machine; the original location
# remains the default.
# os.path.join(..., '') guarantees a trailing separator, because later cells
# build file names with `path + '<file>'`.
path = os.path.join(
    os.environ.get('ML100K_DIR', '/Users/sathisanvannadil/Data/MLData/ml-100k/'),
    '',
)
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(path + 'u.user', sep='|', names=u_cols, encoding='latin-1')

In [3]:

# Ratings table: tab-delimited file; column names are supplied explicitly.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    path + 'u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

In [4]:

# The movies file also contains columns indicating each movie's genres;
# usecols restricts loading to the first five columns of the file.
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv(
    path + 'u.item',
    sep='|',
    names=m_cols,
    usecols=range(5),
    encoding='latin-1',
)


### Inspection

In [5]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 5 columns):
movie_id              1682 non-null int64
title                 1682 non-null object
release_date          1681 non-null object
video_release_date    0 non-null float64
imdb_url              1679 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 65.8+ KB


In [6]:
movies.dtypes

movie_id                int64
title                  object
release_date           object
video_release_date    float64
imdb_url               object
dtype: object

In [11]:
# describe() only summarizes the numeric columns, which here are an identifier
# (movie_id) and an all-null column (video_release_date), so these statistics
# are not meaningful.
movies.describe()

Unnamed: 0,movie_id,video_release_date
count,1682.0,0.0
mean,841.5,
std,485.695893,
min,1.0,
25%,421.25,
50%,841.5,
75%,1261.75,
max,1682.0,


In [12]:
movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995)


In [13]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
user_id       943 non-null int64
age           943 non-null int64
sex           943 non-null object
occupation    943 non-null object
zip_code      943 non-null object
dtypes: int64(2), object(3)
memory usage: 36.9+ KB


In [18]:
users.describe()

Unnamed: 0,user_id,age
count,943.0,943.0
mean,472.0,34.051962
std,272.364951,12.19274
min,1.0,7.0
25%,236.5,25.0
50%,472.0,31.0
75%,707.5,43.0
max,943.0,73.0


In [19]:
users.tail()

Unnamed: 0,user_id,age,sex,occupation,zip_code
938,939,26,F,student,33319
939,940,32,M,administrator,2215
940,941,20,M,student,97229
941,942,48,F,librarian,78209
942,943,22,M,student,77841


In [20]:
movies.tail()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998)
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...
1681,1682,Scream of Stone (Schrei aus Stein) (1991),08-Mar-1996,,http://us.imdb.com/M/title-exact?Schrei%20aus%...


In [21]:
# Rows at positions 20 through 24 (the end of the slice is exclusive)
movies.iloc[20:25]

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
20,21,Muppet Treasure Island (1996),16-Feb-1996,,http://us.imdb.com/M/title-exact?Muppet%20Trea...
21,22,Braveheart (1995),16-Feb-1996,,http://us.imdb.com/M/title-exact?Braveheart%20...
22,23,Taxi Driver (1976),16-Feb-1996,,http://us.imdb.com/M/title-exact?Taxi%20Driver...
23,24,Rumble in the Bronx (1995),23-Feb-1996,,http://us.imdb.com/M/title-exact?Hong%20Faan%2...
24,25,"Birdcage, The (1996)",08-Mar-1996,,"http://us.imdb.com/M/title-exact?Birdcage,%20T..."


### Selecting
Jul 21, 2018

In [23]:
users['occupation'].head()

0    technician
1         other
2        writer
3    technician
4         other
Name: occupation, dtype: object

In [24]:
users[['age', 'occupation']].head()

Unnamed: 0,age,occupation
0,24,technician
1,53,other
2,23,writer
3,24,technician
4,33,other


In [25]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [32]:
# First ten users who satisfy BOTH conditions: age below 31 AND sex == 'F'
users.loc[users['age'].lt(31) & users['sex'].eq('F')].head(10)

Unnamed: 0,user_id,age,sex,occupation,zip_code
11,12,28,F,other,6405
22,23,30,F,artist,48197
23,24,21,F,artist,94533
31,32,28,F,student,78741
34,35,20,F,homemaker,42459
35,36,19,F,student,93117
37,38,28,F,other,54467
42,43,29,F,librarian,20854
45,46,27,F,marketing,46538
48,49,23,F,student,76111


In [31]:
# First ten users who satisfy EITHER condition: age above 21 OR sex == 'F'
users.loc[users['age'].gt(21) | users['sex'].eq('F')].head(10)

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
5,6,42,M,executive,98101
6,7,57,M,administrator,91344
7,8,36,M,administrator,5201
8,9,29,M,student,1002
9,10,53,M,lawyer,90703


In [36]:
# Set index to user_id.
# .set_index() returns a new DataFrame; the original DataFrame is preserved.
# .reset_index() likewise returns a new DataFrame; the original is preserved.
# The full re-indexed frame is kept here (no .head() on the assignment), so
# users1 is not silently truncated to five rows; call .head() when displaying.

users1 = users.set_index('user_id')


In [37]:
users1.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [38]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [39]:
# NOTE: plain assignment does not copy. usersc is a second name for the same
# DataFrame object as users, so in-place changes made through usersc also
# show up in users. Use users.copy() when an independent frame is needed.
usersc = users

In [40]:
usersc.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [41]:
# inplace=True modifies usersc itself instead of returning a new DataFrame.
# Because usersc was bound with `usersc = users`, users is the same object
# and now has user_id as its index as well.
# Not idempotent: once user_id is the index it is no longer a column, so
# running this cell a second time fails.
usersc.set_index('user_id', inplace=True)

In [42]:
usersc.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [43]:
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [46]:
# Selecting rows by position -- iloc (purely integer-location based indexing for selection by position)
# zero-based, so position 5 displays the 6th row

usersc.iloc[5]

age                  42
sex                   M
occupation    executive
zip_code          98101
Name: 6, dtype: object

In [48]:
usersc.head(10)

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
6,42,M,executive,98101
7,57,M,administrator,91344
8,36,M,administrator,5201
9,29,M,student,1002
10,53,M,lawyer,90703


In [50]:
usersc.iloc[[2, 5, 7]] # For multi-row selection provide a list

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,23,M,writer,32067
6,42,M,executive,98101
8,36,M,administrator,5201


In [51]:
usersc.loc[[2, 5, 7]] # Note the difference, loc (label-based indexing) instead of iloc (position-based indexing)

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,53,F,other,94043
5,33,F,other,15213
7,57,M,administrator,91344


In [57]:
# Every user whose age is exactly 53
usersc.loc[usersc['age'].eq(53)]

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,53,F,other,94043
10,53,M,lawyer,90703
47,53,M,marketing,7102
133,53,M,engineer,78602
144,53,M,programmer,20910
170,53,F,healthcare,30067
185,53,F,librarian,97403
420,53,M,educator,2140
515,53,M,marketing,49508
516,53,F,librarian,10021


In [58]:
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [59]:
movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995)


### Joining
`pd.merge` selects the join type with its `how` parameter: {'left', 'right', 'outer', 'inner'}, default 'inner'

In [60]:
# Two toy frames sharing a 'key' column; their keys overlap only on 2, 3 and 4.
left_frame = pd.DataFrame(
    {
        'key': range(5),
        'left_val': ['a', 'b', 'c', 'd', 'e'],
    }
)
right_frame = pd.DataFrame(
    {
        'key': range(2, 7),
        'right_val': ['f', 'g', 'h', 'i', 'j'],
    }
)

In [61]:
left_frame

Unnamed: 0,key,left_val
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


In [62]:
right_frame

Unnamed: 0,key,right_val
0,2,f
1,3,g
2,4,h
3,5,i
4,6,j


In [73]:
# Inner join: keep only the keys present in both frames
left_frame.merge(right_frame, on='key', how='inner')

Unnamed: 0,key,left_val,right_val
0,2,c,f
1,3,d,g
2,4,e,h


In [81]:
# Left outer join: every row of left_frame is kept; right_val is missing where no key matches
left_frame.merge(right_frame, on='key', how='left')

Unnamed: 0,key,left_val,right_val
0,0,a,
1,1,b,
2,2,c,f
3,3,d,g
4,4,e,h


In [82]:
# Right outer join: every row of right_frame is kept; left_val is missing where no key matches
left_frame.merge(right_frame, on='key', how='right')

Unnamed: 0,key,left_val,right_val
0,2,c,f
1,3,d,g
2,4,e,h
3,5,,i
4,6,,j


In [83]:
# Full outer join: keys from either frame are kept; values are missing on the side with no match
left_frame.merge(right_frame, on='key', how='outer')

Unnamed: 0,key,left_val,right_val
0,0,a,
1,1,b,
2,2,c,f
3,3,d,g
4,4,e,h
5,5,,i
6,6,,j


### Combining using concat
Similar to SQL's UNION ALL: rows are stacked and duplicates are not removed

In [89]:
# Vertically append (stack rows); axis=0 is concat's default.
# ignore_index=True discards the original row labels and renumbers the result.
pd.concat([left_frame, right_frame], axis=0, sort=True, ignore_index=True)

Unnamed: 0,key,left_val,right_val
0,0,a,
1,1,b,
2,2,c,
3,3,d,
4,4,e,
5,2,,f
6,3,,g
7,4,,h
8,5,,i
9,6,,j


In [86]:
# Horizontally append: the frames are placed side by side, matched on row index,
# so each frame's own 'key' column is kept in the result.
pd.concat([left_frame, right_frame], axis=1, sort=True)

Unnamed: 0,key,left_val,key.1,right_val
0,0,a,2,f
1,1,b,3,g
2,2,c,4,h
3,3,d,5,i
4,4,e,6,j
