In [None]:
import pandas as pd # pandas is a data manipulation library
import numpy as np #provides numerical arrays and functions to manipulate the arrays efficiently
import random
import matplotlib.pyplot as plt # data visualization library
import operator

In [None]:
# Column names for ml-100k/u.item: five descriptive fields followed by one
# column per genre (presumably 0/1 flags -- confirm against the dataset README).
m_cols = [
    'movie_id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi',
    'Thriller', 'War', 'Western',
]
# The file is pipe-delimited; column names are supplied here via `names`.
data_u_item = pd.read_csv(
    'ml-100k/u.item',
    delimiter='|',
    names=m_cols,
    encoding='latin-1',
)

In [None]:
# Preview the first rows of the movie metadata frame.
data_u_item.head()

In [None]:
# Print a summary of the movie metadata frame.
data_u_item.info()

In [None]:
# Load the user table (pipe-delimited; column names are supplied via `names`).
u_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
data_u_users = pd.read_csv(
    'ml-100k/u.user',
    delimiter='|',
    names=u_cols,
    encoding='latin-1',
)

In [None]:
# Preview the first rows of the user table.
data_u_users.head()

In [None]:
# Load the u1.base ratings (tab-delimited; column names are supplied via `names`).
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
data_rating_u1_base = pd.read_csv(
    'ml-100k/u1.base',
    delimiter='\t',
    names=r_cols,
    encoding='latin-1',
)

In [None]:
# Preview the first rows of the u1.base ratings.
data_rating_u1_base.head()

In [None]:
# Attach the movie metadata to every rating row; the inner join keeps only
# movie_ids present in both frames.
movies_ratings_data = pd.merge(
    data_u_item,
    data_rating_u1_base,
    on='movie_id',
    how='inner',
)

# 2

## a

In [None]:
def movies_by_mean_ratings(movies_and_ratings_data):
    """Return the mean rating of every movie, best first.

    Parameters
    ----------
    movies_and_ratings_data : pandas.DataFrame
        Must contain the columns 'movie_id', 'movie title' and 'rating'.

    Returns
    -------
    pandas.DataFrame
        Indexed by ('movie_id', 'movie title') with a single 'rating' column
        holding the per-movie mean, sorted in descending order.
    """
    key_columns = ['movie_id', 'movie title']
    rating_view = movies_and_ratings_data[key_columns + ['rating']]
    per_movie_mean = rating_view.groupby(key_columns).mean()
    return per_movie_mean.sort_values(['rating'], ascending=False)

In [None]:
# Top three movies by mean rating. print() with parentheses runs on Python 2 and 3.
print(movies_by_mean_ratings(movies_ratings_data).head(3))

## b

In [None]:
def movies_by_count_ratings(movies_and_ratings_data):
    """Return the number of ratings each movie received, most-rated first.

    Parameters
    ----------
    movies_and_ratings_data : pandas.DataFrame
        Must contain the columns 'movie_id', 'movie title' and 'rating'.

    Returns
    -------
    pandas.DataFrame
        Indexed by ('movie_id', 'movie title') with a single 'rating' column
        holding the per-movie rating count, sorted in descending order.
    """
    key_columns = ['movie_id', 'movie title']
    rating_view = movies_and_ratings_data[key_columns + ['rating']]
    per_movie_count = rating_view.groupby(key_columns).count()
    return per_movie_count.sort_values(['rating'], ascending=False)

In [None]:
# Top three movies by number of ratings. print() with parentheses runs on Python 2 and 3.
print(movies_by_count_ratings(movies_ratings_data).head(3))

## c

In [None]:
def movies_by_rating_percent(movies_and_ratings_data):
    """Rank movies by the percentage of their ratings that are above 4.

    Parameters
    ----------
    movies_and_ratings_data : pandas.DataFrame
        Must contain the columns 'movie_id', 'movie title' and 'rating'.

    Returns
    -------
    pandas.DataFrame
        Columns 'movie_id', 'movie title' and 'rating_percent' (the percentage
        truncated to an integer), sorted by 'rating_percent' descending.
        Movies with no rating above 4 are not included. Ties keep ascending
        ('movie_id', 'movie title') order because the sort is stable.
    """
    key_columns = ['movie_id', 'movie title']
    ratings = movies_and_ratings_data[key_columns + ['rating']]

    # Ratings per movie: all of them, and only those strictly above 4.
    total_counts = ratings.groupby(key_columns).count().reset_index()
    high_counts = ratings[ratings['rating'] > 4].groupby(key_columns).count().reset_index()

    # One vectorised join replaces the per-row lookup and DataFrame.append loop
    # (append was removed in pandas 2.0). Joining on both keys pairs each count
    # with the total for exactly the same movie.
    joined = high_counts.merge(total_counts, on=key_columns, how='inner',
                               suffixes=('_high', '_total'))

    percent = (joined['rating_high'].astype(float) / joined['rating_total']) * 100

    rating_percent = joined[key_columns].copy()
    # Truncate (not round) to an integer, matching int(rating_percent).
    rating_percent['rating_percent'] = percent.astype(int)

    # A stable sort makes the order of equal percentages deterministic.
    return rating_percent.sort_values(['rating_percent'], ascending=False, kind='mergesort')

In [None]:
# Top three movies by share of ratings above 4. print() with parentheses runs on Python 2 and 3.
print(movies_by_rating_percent(movies_ratings_data).head(3))

## d

In [None]:
import math
def new_rating_cala(mean,count):
    """Combine a movie's mean rating with its rating count into one score.

    The score is ``(log10(count) + 1) * mean / 5 + 1.78718``: the mean is
    weighted up as the number of ratings grows, then rescaled and shifted.
    The origin of the 1.78718 offset is not documented in this notebook.

    Parameters
    ----------
    mean : float
        Mean rating of the movie.
    count : int
        Number of ratings; must be positive because ``math.log10`` is applied.

    Returns
    -------
    float
        The combined score.
    """
    popularity_weight = math.log10(count) + 1
    return (popularity_weight * mean) / 5 + 1.78718

In [None]:
def movies_by_new_ratings(movies_and_ratings_data):
    """Rank movies by the combined score from ``new_rating_cala``.

    Parameters
    ----------
    movies_and_ratings_data : pandas.DataFrame
        Must contain the columns 'movie_id', 'movie title' and 'rating'.

    Returns
    -------
    pandas.DataFrame
        Columns 'movie_id', 'movie title' and 'rating' (the combined score),
        sorted by 'rating' descending. Row labels are unique; the previous
        row-by-row build gave every row the label 0.
    """
    key_columns = ['movie_id', 'movie title']

    mean_ratings = movies_by_mean_ratings(movies_and_ratings_data).reset_index()
    count_ratings = movies_by_count_ratings(movies_and_ratings_data).reset_index()

    # Pair each movie's count with its mean in a single join instead of a
    # per-row lookup plus DataFrame.append (removed in pandas 2.0).
    stats = count_ratings.merge(mean_ratings, on=key_columns, how='inner',
                                suffixes=('_count', '_mean'))

    # new_rating_cala works on scalars (math.log10), so apply it pair by pair.
    stats['rating'] = [
        new_rating_cala(movie_rate_mean, movie_rate_count)
        for movie_rate_mean, movie_rate_count in zip(stats['rating_mean'], stats['rating_count'])
    ]

    new_ratings = stats[key_columns + ['rating']]
    return new_ratings.sort_values(['rating'], ascending=False)

In [None]:
# Top three movies by the combined score. print() with parentheses runs on Python 2 and 3.
print(movies_by_new_ratings(movies_ratings_data).head(3))

## e

In [None]:
# Attach each rater's user attributes to the movie/rating rows (inner join on user_id).
movie_ratings_data_and_users = pd.merge(
    movies_ratings_data,
    data_u_users,
    on='user_id',
    how='inner',
)

## Men

In [None]:
# Ratings given by users whose gender is 'M', reduced to the columns the ranking functions need.
mans_rating = movie_ratings_data_and_users.loc[
    movie_ratings_data_and_users.gender == 'M',
    ['movie_id', 'movie title', 'rating'],
]

In [None]:
# Preview the first rows of the men's ratings.
mans_rating.head()

## e, a

In [None]:
# Men: top three movies by mean rating. print() with parentheses runs on Python 2 and 3.
print(movies_by_mean_ratings(mans_rating).head(3))

## e, b

In [None]:
# Men: top three movies by number of ratings. print() with parentheses runs on Python 2 and 3.
print(movies_by_count_ratings(mans_rating).head(3))

## e, c

In [None]:
# Men: top three movies by share of ratings above 4. print() with parentheses runs on Python 2 and 3.
print(movies_by_rating_percent(mans_rating).head(3))

## e, d

In [None]:
# Men: top three movies by the combined score. print() with parentheses runs on Python 2 and 3.
print(movies_by_new_ratings(mans_rating).head(3))

## Women

In [None]:
# Ratings given by users whose gender is 'F', reduced to the columns the ranking functions need.
woman_ratings = movie_ratings_data_and_users.loc[
    movie_ratings_data_and_users.gender == 'F',
    ['movie_id', 'movie title', 'rating'],
]

In [None]:
# Preview the first rows of the women's ratings.
woman_ratings.head()

## e, a

In [None]:
# Women: top three movies by mean rating. print() with parentheses runs on Python 2 and 3.
print(movies_by_mean_ratings(woman_ratings).head(3))

## e, b

In [None]:
# Women: top three movies by number of ratings. print() with parentheses runs on Python 2 and 3.
print(movies_by_count_ratings(woman_ratings).head(3))

## e, c

In [None]:
# Women: top three movies by share of ratings above 4. print() with parentheses runs on Python 2 and 3.
print(movies_by_rating_percent(woman_ratings).head(3))

## e, d

In [None]:
# Women: top three movies by the combined score. print() with parentheses runs on Python 2 and 3.
print(movies_by_new_ratings(woman_ratings).head(3))

## Conclusions:

### Are there differences in mean values between two populations?

In [None]:
# Mean of all individual ratings given by men (not a per-movie average).
mans_rating[['rating']].mean()

In [None]:
# Mean of all individual ratings given by women (not a per-movie average).
woman_ratings[['rating']].mean()

### What are the three most popular movies among women?

In [None]:
# Popularity here is measured with the combined score from Q2-d.
# print() with parentheses runs on Python 2 and 3.
print(movies_by_new_ratings(woman_ratings).head(3))

### What are the three most popular movies among men?

In [None]:
# Popularity here is measured with the combined score from Q2-d.
# print() with parentheses runs on Python 2 and 3.
print(movies_by_new_ratings(mans_rating).head(3))

### Which movies have the highest gap between men and women?

#### abs:

#### high in men:

#### high in women:

#### common ground:

# 3

## a

### build model

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import mean_absolute_error

#Generic function for making a classification model and assessing performance:
def classification_model(model, train,test, predictors, outcome):
    """Fit ``model`` on ``train`` and print its accuracy and MAE on ``test``.

    Parameters
    ----------
    model : estimator
        Object whose ``fit(X, y)`` returns something with ``predict(X)``.
    train, test : pandas.DataFrame
        Frames containing the ``predictors`` and ``outcome`` columns.
    predictors : list of str
        Feature column names.
    outcome : str or list of str
        Target column name(s).

    Returns
    -------
    None
        The two metrics are printed, not returned.
    """
    #Fit the model:
    predictions = model.fit(train[predictors],train[outcome]).predict(test[predictors])

    #Print accuracy (ground truth first, predictions second, as the sklearn metric signatures expect)
    accuracy = metrics.accuracy_score(test[outcome], predictions)
    print("Accuracy : %s" % "{0:.3%}".format(accuracy))

    # print() with parentheses so the cell also runs on Python 3.
    print("MAE: {}".format(mean_absolute_error(test[outcome], predictions)))

### prepare u1.test

In [None]:
# Load u1.test and collapse it to one row per movie holding that movie's mean rating.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
test = (
    pd.read_csv('ml-100k/u1.test', delimiter='\t', names=r_cols, encoding='latin-1')
    .loc[:, ['movie_id', 'rating']]
    .groupby('movie_id')
    .mean()
    .reset_index()
)
test.head()

In [None]:
# Join the per-movie test ratings with the movie metadata, then drop the
# text columns that are not used as model features.
test = pd.merge(data_u_item, test, on='movie_id', how='inner').drop(
    ['movie title', 'video release date', 'IMDb URL'],
    axis=1,
)

In [None]:
# Preview the merged test frame.
test.head()

In [None]:
# need to change value type. for rating and release date
from sklearn.preprocessing import LabelEncoder
def factrozied(data):
    """Encode 'rating' and 'release date' as integer labels, in place.

    'rating' is truncated to an integer, label-encoded and shifted by +1;
    'release date' is cast to string and label-encoded.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'rating' and 'release date'. The frame is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for convenience.
    """
    # NOTE(review): the encoder is refit on every frame passed in, so the same
    # release date (or rating) may get different codes in train and test --
    # confirm this is intended.
    le = LabelEncoder()
    # np.int64 replaces the Python 2-only `long`, which is undefined on Python 3.
    data['rating'] = le.fit_transform(data['rating'].astype(np.int64)) + 1
    data['release date'] = le.fit_transform(data['release date'].astype(str))
    return data

In [None]:
# Label-encode 'rating' and 'release date' in the test frame, then preview it.
test = factrozied(test)
test.head(3)

## AdaBoost

In [None]:
def prepare_train(train):
    """Turn a per-movie rating table into a model-ready training frame.

    The index of ``train`` is moved back into columns, the movie metadata from
    the module-level ``data_u_item`` is joined on ('movie_id', 'movie title'),
    the text columns are dropped and the result is passed through
    ``factrozied``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must expose 'movie_id', 'movie title' and 'rating' after
        ``reset_index()``.

    Returns
    -------
    pandas.DataFrame
        Metadata columns plus the encoded 'rating' and 'release date'.
    """
    flattened = train.reset_index()
    with_metadata = data_u_item.merge(flattened, on=['movie_id', 'movie title'], how='inner')
    features = with_metadata.drop(['movie title', 'video release date', 'IMDb URL'], axis=1)
    return factrozied(features)

### 1: using Q2 - a

In [None]:
# Training frame for model 1: per-movie mean ratings (Q2-a) joined with movie features.
mean_movie_ratings = prepare_train(movies_by_mean_ratings(movies_ratings_data))
mean_movie_ratings.head(3)

In [None]:
# Fit AdaBoost (20 estimators) on the Q2-a training frame and score it on `test`.
predictor_var = [
    'release date',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi',
    'Thriller', 'War', 'Western',
]
outcome_var = ['rating']
model = AdaBoostClassifier(n_estimators=20)
classification_model(model, mean_movie_ratings, test, predictor_var, outcome_var)

### 2: using Q2 - d

In [None]:
# Training frame for model 2: combined scores (Q2-d) joined with movie features.
mean_my_movie_ratings = prepare_train(movies_by_new_ratings(movies_ratings_data))
mean_my_movie_ratings.head(3)

In [None]:
# Fit AdaBoost (20 estimators) on the Q2-d training frame and score it on `test`.
predictor_var = [
    'release date',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi',
    'Thriller', 'War', 'Western',
]
outcome_var = ['rating']
model = AdaBoostClassifier(n_estimators=20)
classification_model(model, mean_my_movie_ratings, test, predictor_var, outcome_var)

# Q3, b

In [None]:
# Reminder of the u1.base layout before building the per-user lookup.
data_rating_u1_base.head(3)

In [None]:
from collections import defaultdict
# Map each user_id to the list of movie_ids that user rated in u1.base.
# The columns are read by name: positional row[0]/row[1] access on a
# label-indexed Series is deprecated in recent pandas, and iterating the two
# columns directly avoids building a Series per row.
all_user = defaultdict(list)
for user_id, movie_id in zip(data_rating_u1_base['user_id'], data_rating_u1_base['movie_id']):
    all_user[user_id].append(movie_id)

In [None]:
def calc_recall_precision():
    """Placeholder for the recall/precision evaluation.

    Returns
    -------
    tuple
        ``(recall, precision)``; both are currently always 0 because no
        computation is implemented yet.
    """
    recall, precision = 0, 0
    return (recall, precision)

In [217]:
# Movie ids ordered by mean rating (Q2-a), best first, as plain ints.
# list(map(...)) keeps this a real list on Python 3, where a bare map() is a
# one-shot iterator that would be exhausted after the first user.
movies_rate_a = movies_by_mean_ratings(movies_ratings_data).reset_index()['movie_id'].values.tolist()
movies_rate_a = list(map(int, movies_rate_a))

In [218]:
# Movie ids ordered by number of ratings (Q2-b), most-rated first, as plain ints.
# list(map(...)) keeps this a real list on Python 3, where a bare map() is a
# one-shot iterator that would be exhausted after the first user.
movies_rate_b = movies_by_count_ratings(movies_ratings_data).reset_index()['movie_id'].values.tolist()
movies_rate_b = list(map(int, movies_rate_b))

In [219]:
# Movie ids ordered by share of ratings above 4 (Q2-c), best first, as plain ints.
# list(map(...)) keeps this a real list on Python 3, where a bare map() is a
# one-shot iterator that would be exhausted after the first user.
movies_rate_c = movies_by_rating_percent(movies_ratings_data).reset_index()['movie_id'].values.tolist()
movies_rate_c = list(map(int, movies_rate_c))

In [220]:
# Movie ids ordered by the combined score (Q2-d), best first, as plain ints.
# list(map(...)) keeps this a real list on Python 3, where a bare map() is a
# one-shot iterator that would be exhausted after the first user.
movies_rate_d = movies_by_new_ratings(movies_ratings_data).reset_index()['movie_id'].values.tolist()
movies_rate_d = list(map(int, movies_rate_d))

In [221]:
# Random baseline: every rated movie id exactly once, in shuffled order.
# The merged frame has one row per rating, so taking its movie_id column as-is
# gave a non-random list full of repeats (a user's "top 20" could be the same
# movie twenty times). De-duplicate first, then shuffle with a dedicated,
# seeded generator so the baseline is reproducible and the global `random`
# state is left untouched.
movies_rate_random = movies_ratings_data['movie_id'].unique().tolist()
movies_rate_random = list(map(int, movies_rate_random))
random.Random(0).shuffle(movies_rate_random)

In [None]:
# For every user, keep the first 20 movies of each ranking that the user has
# not already rated. The per-user value is a list of five lists, in the order
# [mean rating, rating count, percent above 4, combined score, random].
# Each ranking is materialised as a list so it can be scanned once per user
# (a map() object would be empty after the first user on Python 3).
ranked_candidates = [
    list(ranking)
    for ranking in (movies_rate_a, movies_rate_b, movies_rate_c, movies_rate_d, movies_rate_random)
]
all_users_recomendations = {}
for user_id, movie_list in all_user.items():  # .items() exists on Python 2 and 3
    already_rated = set(movie_list)  # constant-time membership instead of a list scan
    all_users_recomendations[user_id] = [
        [item for item in ranking if item not in already_rated][0:20]
        for ranking in ranked_candidates
    ]

## example:

In [None]:
# Recommendations for user 1 under the first ranking (by mean rating).
all_users_recomendations[1][0]

# Q3, c

In [None]:
# Same as Q3,b, but computed separately for women and for men.