In [1]:
#Reading files

import numpy as np
import pandas as pd

# Column names for each MovieLens 100k table; they are passed explicitly to
# read_csv via `names=` below.
r_cols = ['user_id', 'item_id', 'rating', 'timestamp']
m_cols = [
    # descriptive fields
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url',
] + [
    # one column per genre (summed later to count the movies in each genre)
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
g_cols = ['genre', 'genre_id']
o_cols = ['occupation']

# Ratings file is tab-separated; every other file is pipe-separated.
data_df = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')
item_df = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, encoding='latin-1')
user_df = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
    parse_dates=True,
)
genre_df = pd.read_csv('ml-100k/u.genre', sep='|', names=g_cols, encoding='latin-1')
occupation_df = pd.read_csv('ml-100k/u.occupation', sep='|', names=o_cols, encoding='latin-1')




In [2]:
# Summary statistics of the ratings table (data_df): count, mean, std, min,
# quartiles and max for user_id, item_id, rating and timestamp.

data_df.describe()

Unnamed: 0,user_id,item_id,rating,timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125674,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [None]:
# Per-column flag: does the ratings table (data_df) contain any missing value?
data_df.isna().any()

user_id      False
item_id      False
rating       False
timestamp    False
dtype: bool

In [None]:
#Rating vs count

import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how often each rating value occurs in the ratings table.
rating_axes = sns.countplot(x='rating', data=data_df)
plt.show()

In [None]:
#Average ratings per user

import numpy as np


# Mean rating given by each user: indexed by user_id, single 'rating' column.
dftmp = data_df[['user_id','rating']].groupby('user_id').mean()
# Histogram of the per-user means, normalised to unit area.
# `density=True` replaces the `normed` keyword, which no longer exists in
# current Matplotlib / NumPy releases.
ax = dftmp.plot(kind='hist', bins=50, grid=False, density=True, edgecolor='black', figsize=(10,5))
# Build an empirical CDF from a second (40-bin) normalised histogram of the same values.
values, base = np.histogram(dftmp['rating'], bins=40, density=True)
# density * bin width is the probability mass of each bin; the running sum of
# those masses is the CDF evaluated at each bin's RIGHT edge.
cumulative = np.cumsum(values * np.diff(base))
# Plot against the right edges (base[1:]) so the curve is not shifted one bin to the left.
ax.plot(base[1:], cumulative, c='blue', label='CDF')
ax.set_xlim(0, 5)
ax.legend()
ax.set_xlabel('Average movie rating')
ax.set_ylabel('Normalized frequency')
ax.set_title('Average ratings per user')
plt.show()




In [None]:
#Number of movies rated by each user

# Rating count per user_id, most active users first.
no_of_rated_movies_per_user = data_df.groupby(by='user_id')['rating'].count().sort_values(ascending=False)

# Left panel: kernel density estimate of the per-user rating counts.
# `fill=True` is the current spelling of the deprecated `shade=True` option.
ax1 = plt.subplot(121)
sns.kdeplot(no_of_rated_movies_per_user, fill=True, ax=ax1)
plt.xlabel('No of ratings by user')
plt.title("PDF")

# Right panel: the same estimate, accumulated into a CDF.
ax2 = plt.subplot(122)
sns.kdeplot(no_of_rated_movies_per_user, fill=True, cumulative=True, ax=ax2)
plt.xlabel('No of ratings by user')
plt.title('CDF')

plt.show()



In [None]:
#Basic statistics on the number of ratings submitted per user

# One row per user_id with the number of ratings that user submitted.
ratings_count=data_df.groupby('user_id').size().reset_index(name='counts')

# Plain Python list of the per-user counts (taken directly from the column
# instead of appending element by element in an index loop).
counts = ratings_count['counts'].tolist()

print("Minimum number of times an user has rated movies:",min(counts))
print("Maximum number of times an user has rated movies:",max(counts))
print("Average number of times an user has rated movies:",np.mean(counts))
# This line reports the median; it was previously mislabelled as "Average".
print("Median number of times an user has rated movies:",np.median(counts))
# np.std defaults to the population standard deviation (ddof=0).
print("Standard deviation:", np.std(counts))


In [None]:
# Summary statistics of the movie table (item_df).

item_df.describe()

In [None]:
# Per-column flag: does the movie table (item_df) contain any missing value?

item_df.isna().any()

In [None]:
# Movie count per genre: sum each genre column of item_df.

# Genre names as listed in the genre table; each is also a column of item_df.
l = genre_df['genre'].tolist()
# Build [genre_name, movie_count] pairs in genre-table order.
ll = []
for genre_name in l:
    ll.append([genre_name, item_df[genre_name].sum()])
print(ll)

In [None]:
import matplotlib.pyplot as plt # data visualization library
from wordcloud import WordCloud, STOPWORDS #used to generate world cloud


# Function that controls the color of the words
def random_color_func(word=None, font_size=None, position=None,
                      orientation=None, font_path=None, random_state=None):
    """Return an ``hsl(h, s%, l%)`` colour string for one word of a word cloud.

    The hue is derived from the module-level variable ``tone`` (expected on a
    0-255 scale), which must be defined before this function is *called*.
    Saturation is always 100%. Lightness is drawn at random via
    ``random_state.randint(70, 120)`` and rescaled from 0-255 to a percentage.

    Parameters
    ----------
    word, font_size, position, orientation, font_path
        Accepted so the function matches the keyword arguments of a WordCloud
        ``color_func`` callback; they are not used.
    random_state
        Object exposing ``randint(a, b)``. When ``None`` (the default), a fresh
        ``random.Random()`` is used instead of failing on ``None.randint``.

    Returns
    -------
    str
        Colour in the form ``"hsl(H, S%, L%)"``.
    """
    if random_state is None:
        # Local import keeps the function self-contained within this cell.
        import random
        random_state = random.Random()
    hue = int(360.0 * tone / 255.0)
    saturation = 100  # constant: the original expression 100 * 255 / 255
    lightness = int(100.0 * float(random_state.randint(70, 120)) / 255.0)
    return "hsl({}, {}%, {}%)".format(hue, saturation, lightness)


# Render the genre counts as a word cloud sized by frequency.
trunc_occurences = ll[:50]
# Map each genre name to its movie count.
words = {entry[0]: entry[1] for entry in trunc_occurences}
tone = 100 # hue input read by random_color_func when the cloud is coloured
f, ax = plt.subplots(figsize=(14, 6))
wordcloud = WordCloud(
    width=550,
    height=300,
    background_color='black',
    max_words=1628,
    relative_scaling=0.7,
    color_func=random_color_func,
    normalize_plurals=False,
)
wordcloud.generate_from_frequencies(words)
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

In [None]:
#Word cloud of user occupations

import numpy as np # linear algebra
import pandas as pd 
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

from subprocess import check_output
from wordcloud import WordCloud, STOPWORDS

# Global matplotlib settings for the figures below (previous values in trailing comments).
#mpl.rcParams['figure.figsize']=(8.0,6.0)    #(6.0,4.0)
mpl.rcParams['font.size']=12                #10 
mpl.rcParams['savefig.dpi']=100             #72 
mpl.rcParams['figure.subplot.bottom']=.1 


# Not needed when the cloud is built from explicit frequencies; kept so the
# name stays defined for any cell that may rely on it.
stopwords = set(STOPWORDS)


# Count how many users hold each occupation. The previous code fed
# str(user_df['occupation']) to the cloud, i.e. the *printed, truncated* Series
# (a handful of rows plus "Name", "Length", "dtype"), not the real data.
occupation_counts = user_df['occupation'].value_counts().to_dict()

wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate_from_frequencies(occupation_counts)

fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()






In [None]:
# Summary statistics of the user table (user_df).

user_df.describe()

In [None]:
# Per-column flag: does the user table (user_df) contain any missing value?

user_df.isna().any()

In [None]:
# Preview the first rows of the user table.
user_df.head()

In [None]:
#Male vs Female users count

# The user table's gender column is named 'sex' (see u_cols); there is no
# 'gender' attribute on user_df, so the column is referenced by its real name.
sns.countplot(x='sex', data=user_df)
plt.show()

In [None]:
# Histogram of user ages.

age_fig, age_ax = plt.subplots()
age_ax.hist(user_df['age'])
age_ax.set_xlabel("Age")
age_ax.set_ylabel("Count")
plt.show()