In [3]:
# -*- coding: utf-8 -*-
__author__ = 'virginiedesharnais'

import pandas as pd
import numpy as np


In [4]:
# Load the three '::'-delimited data files (users, movies, ratings).
# None of the files has a header row, so column names are supplied explicitly.
# The multi-character separator is why the (slower) python parsing engine is requested.
userHeader = ['user_id', 'gender', 'age', 'ocupation', 'zip']
movieHeader = ['movie_id', 'title', 'genders']
ratingHeader = ['user_id', 'movie_id', 'rating', 'timestamp']

users = pd.read_csv(
    'dataSet/users.txt',
    sep='::',
    names=userHeader,
    header=None,
    encoding='ISO-8859-1',
    engine='python',
)

movies = pd.read_csv(
    'dataSet/movies.txt',
    sep='::',
    names=movieHeader,
    header=None,
    encoding='ISO-8859-1',
    engine='python',
)

ratings = pd.read_csv(
    'dataSet/ratings.txt',
    sep='::',
    names=ratingHeader,
    header=None,
    encoding='ISO-8859-1',
    engine='python',
)


In [5]:
# Build one wide table: users joined to their ratings, then to the rated movies.
# No join keys are given, so each merge is an inner join on the columns the two
# frames have in common (user_id for the first step, movie_id for the second).
mergeRatings = users.merge(ratings).merge(movies)


In [6]:
# Clone DataFrame


def cloneDF(df):
    """Return an independent copy of ``df`` with numeric columns re-inferred.

    The copy is rebuilt from copies of the values, index and columns of ``df``,
    so changes made to the result do not affect ``df``. Each column is then
    passed through ``pd.to_numeric``; a column that cannot be parsed as numbers
    is kept exactly as it is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to duplicate.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``.
    """
    def _to_numeric_or_keep(column):
        # Explicit replacement for pd.to_numeric(..., errors='ignore'), which
        # pandas has deprecated: on a parsing failure return the input untouched.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    a = pd.DataFrame(df.values.copy(), df.index.copy(), df.columns.copy())
    return a.apply(_to_numeric_or_keep)

In [7]:

# Films with the most votes: count the rows per title, largest count first
# (groupby + sort_values).
numberRatings = (
    cloneDF(mergeRatings)
    .groupby('title')
    .size()
    .sort_values(ascending=False)
)
print('Films with more votes: \n%s' % numberRatings.head(10))
print('\n==================================================================\n')


Films with more votes: 
title
American Beauty (1999)                                   3428
Star Wars: Episode IV - A New Hope (1977)                2991
Star Wars: Episode V - The Empire Strikes Back (1980)    2990
Star Wars: Episode VI - Return of the Jedi (1983)        2883
Jurassic Park (1993)                                     2672
Saving Private Ryan (1998)                               2653
Terminator 2: Judgment Day (1991)                        2649
Matrix, The (1999)                                       2590
Back to the Future (1985)                                2583
Silence of the Lambs, The (1991)                         2578
dtype: int64




In [8]:

# Average rating per movie (groupby + mean).
# mergeRatings also carries text columns (gender, zip, genders). Recent pandas
# versions raise a TypeError when asked to average those, so the mean is
# restricted to numeric columns explicitly.
avgRatings = (
    cloneDF(mergeRatings)
    .groupby(['movie_id', 'title'])
    .mean(numeric_only=True)
)
print('Avg ratings: \n%s' % avgRatings['rating'].head(10))
print('\n==================================================================\n')


Avg ratings: 
movie_id  title                             
1         Toy Story (1995)                      4.146846
2         Jumanji (1995)                        3.201141
3         Grumpier Old Men (1995)               3.016736
4         Waiting to Exhale (1995)              2.729412
5         Father of the Bride Part II (1995)    3.006757
6         Heat (1995)                           3.878723
7         Sabrina (1995)                        3.410480
8         Tom and Huck (1995)                   3.014706
9         Sudden Death (1995)                   2.656863
10        GoldenEye (1995)                      3.540541
Name: rating, dtype: float64




In [9]:

# Rating statistics per movie (groupby + several aggregation functions):
# mean, sum, count and standard deviation of the 'rating' column.
dataRatings = (
    cloneDF(mergeRatings)
    .groupby(['movie_id', 'title'])['rating']
    .agg(['mean', 'sum', 'count', 'std'])
)
print('Films ratings info: \n%s' % dataRatings.head(10))
print('\n==================================================================\n')


Films ratings info: 
                                                 mean   sum  count       std
movie_id title                                                              
1        Toy Story (1995)                    4.146846  8613   2077  0.852349
2        Jumanji (1995)                      3.201141  2244    701  0.983172
3        Grumpier Old Men (1995)             3.016736  1442    478  1.071712
4        Waiting to Exhale (1995)            2.729412   464    170  1.013381
5        Father of the Bride Part II (1995)  3.006757   890    296  1.025086
6        Heat (1995)                         3.878723  3646    940  0.934588
7        Sabrina (1995)                      3.410480  1562    458  0.979918
8        Tom and Huck (1995)                 3.014706   205     68  0.954059
9        Sudden Death (1995)                 2.656863   271    102  1.048290
10       GoldenEye (1995)                    3.540541  3144    888  0.891233




In [10]:


# Rating statistics per movie with a hand-written aggregate next to the
# built-in ones (groupby + lambda function).
# The built-ins are requested by name ('sum', 'size', 'mean') rather than by
# passing NumPy functions, which pandas has deprecated inside agg().
# myAVG recomputes the mean as sum / count so it can be compared with AVG.
myAvg = (
    cloneDF(mergeRatings)
    .groupby(['movie_id', 'title'])['rating']
    .agg(
        SUM='sum',
        COUNT='size',
        AVG='mean',
        myAVG=lambda x: x.sum() / float(x.count()),
    )
)
print('My info ratings: \n%s' % myAvg.head(10))
print('\n==================================================================\n')


My info ratings: 
                                              SUM  COUNT       AVG     myAVG
movie_id title                                                              
1        Toy Story (1995)                    8613   2077  4.146846  4.146846
2        Jumanji (1995)                      2244    701  3.201141  3.201141
3        Grumpier Old Men (1995)             1442    478  3.016736  3.016736
4        Waiting to Exhale (1995)             464    170  2.729412  2.729412
5        Father of the Bride Part II (1995)   890    296  3.006757  3.006757
6        Heat (1995)                         3646    940  3.878723  3.878723
7        Sabrina (1995)                      1562    458  3.410480  3.410480
8        Tom and Huck (1995)                  205     68  3.014706  3.014706
9        Sudden Death (1995)                  271    102  2.656863  2.656863
10       GoldenEye (1995)                    3144    888  3.540541  3.540541




In [16]:

# Per-movie vote count and hand-computed average, ordered by the computed
# COUNT column, most-voted first (groupby + lambda function + sort_values).
# 'size' is requested by name rather than by passing np.size, which pandas has
# deprecated inside agg().
SehijaEil = (
    cloneDF(mergeRatings)
    .groupby(['movie_id', 'title'])['rating']
    .agg(
        COUNT='size',
        myAVG=lambda x: x.sum() / float(x.count()),
    )
    .sort_values('COUNT', ascending=False)
)
print('My info sorted: \n%s' % SehijaEil.head(15))


My info sorted: 
                                                             COUNT     myAVG
movie_id title                                                              
2858     American Beauty (1999)                               3428  4.317386
260      Star Wars: Episode IV - A New Hope (1977)            2991  4.453694
1196     Star Wars: Episode V - The Empire Strikes Back ...   2990  4.292977
1210     Star Wars: Episode VI - Return of the Jedi (1983)    2883  4.022893
480      Jurassic Park (1993)                                 2672  3.763847
2028     Saving Private Ryan (1998)                           2653  4.337354
589      Terminator 2: Judgment Day (1991)                    2649  4.058513
2571     Matrix, The (1999)                                   2590  4.315830
1270     Back to the Future (1985)                            2583  3.990321
593      Silence of the Lambs, The (1991)                     2578  4.351823
1580     Men in Black (1997)                               

In [15]:
# Note: sort was replaced by sort_values