[View in Colaboratory](https://colab.research.google.com/github/ShriPunta/Netflix-Graph-Dataset-Project/blob/master/ParsingTheCombinedFile.ipynb)

In [0]:
# Install (or upgrade, -U) PyDrive quietly (-q); the pydrive imports in the next code cell need it.
!pip install -U -q PyDrive


In [0]:
#!pip install python-igraph

**Setup all the Authentication for PyDrive**

In [0]:

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import io
from googleapiclient.http import MediaIoBaseDownload
from googleapiclient.discovery import build
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from google.colab import files

In [0]:
%matplotlib inline

In [0]:
# 1. Authenticate and create the PyDrive client.
# Colab-side authentication of the current user (google.colab.auth).
auth.authenticate_user()
# Hand the application-default credentials to PyDrive's GoogleAuth object.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# PyDrive client built from those credentials.
# NOTE(review): `drive` is not referenced by any other visible cell; downloads go
# through googleapiclient in read_drive_file_into_variable instead.
drive = GoogleDrive(gauth)


---

**Method to combine all the records**

In [0]:
def merge_all_texts(file_list):
  """Concatenate one or more files, in order, into 'AllMerged.txt'.

  file_list: either a single path or an iterable of paths. A single path
             (str, bytes or any non-iterable accepted by open()) is treated
             as a one-element list.

  The output file is created in the current working directory and is
  overwritten if it already exists. Nothing is returned.
  """
  import shutil

  # The previous loop iterated over `[file_list]`, so an actual list of paths
  # was handed to open() as a whole and failed. Normalise to a list of paths.
  if isinstance(file_list, (str, bytes)) or not hasattr(file_list, '__iter__'):
    paths = [file_list]
  else:
    paths = list(file_list)

  with open('AllMerged.txt','wb') as wfd:
    for path in paths:
      with open(path,'rb') as fd:
        #10MB per writing chunk to avoid reading big file into memory.
        shutil.copyfileobj(fd, wfd, 1024*1024*10)


**_Method_: read the contents of a Google Drive file into a list of lines**



In [0]:
def read_drive_file_into_variable(file_id_to_read):
  """Download a Google Drive file and return its text as a list of lines.

  file_id_to_read: Drive file id passed to files().get_media().

  The file is downloaded chunk by chunk into memory, a "Download <percent>"
  line is printed after every chunk, the bytes are decoded as UTF-8 and the
  resulting string is split on line boundaries.
  """
  service = build('drive', 'v3')
  media_request = service.files().get_media(fileId=file_id_to_read)
  buffer = io.BytesIO()
  chunk_downloader = MediaIoBaseDownload(buffer, media_request)

  while True:
    # next_chunk() yields a progress object and a completion flag
    progress, finished = chunk_downloader.next_chunk()
    percent_done = int(progress.progress() * 100)
    print("Download "+str(percent_done))
    if finished is not False:
      break

  # Everything downloaded so far, as bytes
  raw_bytes = buffer.getvalue()

  # Bytes -> str -> one list entry per line
  text = raw_bytes.decode(encoding="utf-8")
  return text.splitlines()
  
  

---

**Read the main two files into variables**

In [0]:
# Google Drive file ids, each intended as the argument of read_drive_file_into_variable.
# Only movie_Title_file_id and part_rating_file_id are loaded by active cells;
# the others appear only in commented-out cells.
movie_Title_file_id = '0B8qgJkz0ynl8czBBNG9qZ2JPeW9RaWVuZktobUE1b29qaER3'
part_rating_file_id = '1iABtudmoCPxcFYiYQ0cQwfcQCZtfHsHd'
test_file_id = '11QhP0HwV7x6huJX3-JqJzGv7shMiZVPH'
merged_file_id = '1zhm2Wo8qBun5p01z_hjZlkww1TIbkTJy'
merge_first3_files = '1R8g2MWa1Czpzt7EAGUfcvADJ-yIB2Yi9'

In [47]:
# Lines of the movie-title ("key") file; turned into key_file_df further down.
key_file_list = read_drive_file_into_variable(movie_Title_file_id)

Download 100


In [0]:
#merged_file3_list = read_drive_file_into_variable(merge_first3_files)

In [0]:
#merged_file_list = read_drive_file_into_variable(merged_file_id)

In [50]:
# Lines of the ratings file: "<movieId>:" header lines followed by that movie's rating lines.
rating_file_list = read_drive_file_into_variable(part_rating_file_id)

Download 21
Download 42
Download 63
Download 84
Download 100


In [0]:
#test_file_list = read_drive_file_into_variable(test_file_id)

In [0]:
# Single switch for which downloaded list the rest of the notebook processes
# (an alias of rating_file_list, not a copy).
final_file_list_to_use = rating_file_list

---

**Convert the key file into a dataframe**

In [53]:
#Read the key file into a Panda dataframe
# Each line is split on every comma, so a quoted multi-genre field is spread over
# Genre1..Genre3 with the quote characters left on the first and last piece
# (visible in the head() output); clean_the_movie_key_df strips the quotes from Genre1.
# NOTE(review): a line with more than six comma-separated pieces would not fit these columns.
key_file_df = pd.DataFrame([sub.split(",") for sub in key_file_list],columns = ["MovieId","Year","Name","Genre1","Genre2","Genre3"])

key_file_df.head()

Unnamed: 0,MovieId,Year,Name,Genre1,Genre2,Genre3
0,1,2003,Dinosaur Planet,"""Animation",Documentary,"Family"""
1,3,1997,Character,"""Crime",Drama,"Mystery"""
2,6,1997,Sick,Drama,,
3,7,1992,8 Man,"""Action","Sci-Fi""",
4,10,2001,Fighter,Documentary,,


In [54]:
key_file_df.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11066 entries, 0 to 11065
Data columns (total 6 columns):
MovieId    11066 non-null object
Year       11066 non-null object
Name       11066 non-null object
Genre1     11066 non-null object
Genre2     8381 non-null object
Genre3     4876 non-null object
dtypes: object(6)
memory usage: 3.8 MB


**Clean the dataframe**

In [0]:
#key_file_df['Genre'] = key_file_df[['Genre1','Genre2','Genre3']].apply(lambda x: ''.join(x), axis=1)
# key_file_df.head()

def clean_the_movie_key_df(key_file_df):
  """Return a cleaned copy of the raw movie-key dataframe, indexed by MovieId.

  key_file_df: dataframe with columns MovieId, Year, Name, Genre1, Genre2, Genre3
               (all read as text). The input frame is not modified.

  Returns a new dataframe with
    * index   : MovieId, numeric
    * columns : Year (numeric), Name (str), Genre2, Genre3 (missing -> ''),
                Genre (Genre1 with double quotes removed)
  Rows whose MovieId or Year cannot be parsed as a number are dropped.
  """
  #Convert NaN/None to blank
  cleaned = key_file_df.fillna('')

  #Genre currently holds only the first genre, with the stray double quotes removed
  cleaned['Genre'] = cleaned['Genre1'].str.replace('"', '')

  #Drop the column that Genre replaces
  cleaned = cleaned.drop(['Genre1'], axis=1)

  #Unparseable values become NaN so they can be filtered out
  movie_ids = pd.to_numeric(cleaned['MovieId'],errors = 'coerce')
  cleaned['Year'] = pd.to_numeric(cleaned['Year'],errors = 'coerce')

  #Convert to String
  cleaned['Name'] = cleaned['Name'].astype('str')

  #Remove rows without a usable MovieId BEFORE it becomes the index: dropna()
  #only inspects columns, so a NaN index label would otherwise survive and
  #force the whole index to float.
  cleaned = cleaned[movie_ids.notnull()].copy()
  #Every remaining value parses, so the ids keep an integer dtype when they are integral
  cleaned['MovieId'] = pd.to_numeric(cleaned['MovieId'])

  #Set MovieId as the Index
  cleaned = cleaned.set_index('MovieId')

  #Drop any rows which still hold NaN or NULL (e.g. an unparseable Year)
  return cleaned.dropna(axis=0, how='any')


In [0]:
# Replaces the raw frame with its cleaned version.
# NOTE: not re-runnable as-is — the cleaned frame no longer has the Genre1 column that
# clean_the_movie_key_df reads, so re-run the cell that builds key_file_df first.
key_file_df = clean_the_movie_key_df(key_file_df)

In [59]:
#Gives detailing on how many null or na there are
key_file_df.isnull().sum()

Year      0
Name      0
Genre2    0
Genre3    0
Genre     0
dtype: int64

In [60]:
key_file_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11064 entries, 1 to 17770
Data columns (total 5 columns):
Year      11064 non-null float64
Name      11064 non-null object
Genre2    11064 non-null object
Genre3    11064 non-null object
Genre     11064 non-null object
dtypes: float64(1), object(4)
memory usage: 2.9 MB


**We store the index (the movie IDs) as an array, to be used as a reference later**

In [61]:
# NumPy array of the MovieId index values; do_sampling reads this notebook-level
# name to decide which movies to keep.
key_movie_values = key_file_df.index.values
key_movie_values


array([    1,     3,     6, ..., 17765, 17768, 17770])

---

**Create a Map of each movie with the number of ratings it has**

In [0]:
def fill_dict_count(list_to_iterate):
  """Count how many rating lines follow each movie header.

  list_to_iterate: iterable of strings. A string containing ':' is a movie
                   header ("<movieId>:"); any other string is counted as one
                   rating of the most recent header.

  Returns a dict mapping the movie id (as the string before the ':') to its
  number of rating lines. Rating lines that appear before the first header
  belong to no movie and are ignored (previously they raised
  UnboundLocalError). A header seen twice restarts that movie's count.
  """
  each_movie_votes = dict()
  key_to_search = None  # no movie header seen yet
  for ele in list_to_iterate:
    if ':' in ele:
      key_to_search = ele.split(':')[0]

      #Get a map of how many ratings are there per movie
      each_movie_votes[key_to_search] = 0
    elif key_to_search is not None:
      each_movie_votes[key_to_search] += 1

  return each_movie_votes
      
    

**---> PARSING and SAMPLING <---**

**SAMPLING: It takes the 23 million ratings and keeps only a limited number of ratings per movie (controlled by the `limiter` variable, which is set to `percent` of that movie's rating count).**

**We only want ratings for movies whose genre we know, so we weed out movies that are not present in the movie_Title_file.**

**----**
  
**PARSING :  It also adds a comma separated value of the movie id to the tuple. (It is split later to create a dataframe).**

In [0]:
def do_sampling(list_to_iterate,file_rating_dict,percent=0.03,valid_movie_ids=None):
  """Keep the first `percent` of each known movie's ratings, tagged with its id.

  list_to_iterate : iterable of strings; a string containing ':' is a movie
                    header ("<movieId>:"), any other string is a rating line.
  file_rating_dict: mapping from movie id (string, as it appears before ':')
                    to that movie's total rating count (see fill_dict_count).
  percent         : fraction of a movie's ratings to keep; the per-movie limit
                    is round(total * percent). Defaults to the former
                    hard-coded 0.03.
  valid_movie_ids : iterable of numeric movie ids to keep. Defaults to the
                    notebook-level `key_movie_values`.

  Returns a list of "<rating line>,<movieId>" strings.

  Rating lines before the first header are skipped (previously up to two of
  them leaked through tagged with movie id 0). Raises KeyError if a kept
  movie has no entry in file_rating_dict, and ValueError if a header's id is
  not an integer.
  """
  if valid_movie_ids is None:
    # Backward-compatible default: the array built from key_file_df's index
    valid_movie_ids = key_movie_values
  # Hash lookups instead of scanning the whole array once per movie header
  if hasattr(valid_movie_ids, 'tolist'):
    allowed_ids = set(valid_movie_ids.tolist())
  else:
    allowed_ids = set(valid_movie_ids)

  refined_list = []
  key_to_search = None
  keep_current_movie = False  # nothing is kept until a known header is seen
  limiter = 0
  kept = 0

  for ele in list_to_iterate:

    #If the element has a ':' then its a movie id
    if ':' in ele:
      #The ':' is dropped so the id can later be converted with pd.to_numeric
      key_to_search = ele.split(':')[0]
      kept = 0

      #We want only those movies which are present in the movie_Title_file
      keep_current_movie = int(key_to_search) in allowed_ids
      if keep_current_movie:
        total_ratings = file_rating_dict.get(key_to_search)
        if total_ratings is None:
          raise KeyError("no rating count for movie id " + repr(key_to_search))
        #Number of ratings to keep for this movie
        limiter = round(total_ratings * percent)
    elif keep_current_movie and kept < limiter:
      #Add a comma separated value of the movieId to split and form a column later
      refined_list.append(ele+','+ str(key_to_search))
      kept += 1

  return refined_list

In [0]:
# Per-movie rating totals (movie id string -> count); do_sampling uses them to size each movie's sample.
count_strength = fill_dict_count(final_file_list_to_use)

In [0]:
# Sampled rating lines, each with ",<movieId>" appended, for movies present in the key file.
refined_list = do_sampling(final_file_list_to_use,count_strength)


In [66]:
# Distinct per-movie rating counts, largest first (set() collapses movies that share a count).
max_values = sorted(set(count_strength.values()),reverse=True)
print(max_values)

[193941, 162597, 160454, 156183, 154832, 153996, 151292, 149866, 145519, 140979, 140154, 139428, 137170, 136850, 135601, 135431, 134241, 128446, 121769, 121207, 118413, 117270, 116762, 116362, 114997, 113674, 113377, 110159, 108771, 108606, 107443, 106807, 106020, 105697, 104632, 104362, 101188, 101118, 100248, 99812, 98720, 98700, 98696, 97939, 93241, 90591, 90450, 90387, 90010, 89999, 89865, 89714, 87622, 87389, 87139, 85050, 83849, 83680, 81278, 81260, 80295, 80136, 79845, 78980, 78878, 78213, 77314, 76944, 75148, 75140, 74829, 74076, 73795, 73337, 73004, 72381, 71614, 67575, 66266, 65700, 65477, 65363, 63042, 62809, 62628, 62457, 61174, 61019, 60787, 60471, 60415, 59518, 59137, 58527, 58364, 58149, 56918, 56504, 56213, 55740, 55605, 54922, 54617, 54193, 52470, 52388, 52315, 52025, 51948, 51856, 51622, 50353, 50202, 50196, 49969, 48877, 48617, 48496, 48190, 48004, 47811, 47651, 47167, 46978, 46150, 45636, 45620, 45069, 44295, 44001, 43518, 43421, 43122, 42933, 42843, 42727, 42540, 4

In [67]:
print(len(count_strength.keys()))

4499


---
**Get a judgement of the size of the earlier list and size after sampling**


In [68]:
# Compare the element count and list size before and after sampling.
# NOTE: getsizeof(list) reports the list object itself (its array of references),
# not the strings it holds, so the byte figures understate real memory use.
from sys import getsizeof
#print("Total number of elements -->",len(rating_file_list),"  Size they occupy in bytes -->",getsizeof(rating_file_list))

print("Total number of Elements in rating_file_list list -->",len(final_file_list_to_use),"  Size they occupy in bytes -->",getsizeof(final_file_list_to_use))
print("Total number of Elements in refined_list list -->",len(refined_list),"  Size they occupy in bytes -->",getsizeof(refined_list))

Total number of Elements in rating_file_list list --> 24058263   Size they occupy in bytes --> 209183992
Total number of Elements in refined_list list --> 585311   Size they occupy in bytes --> 4826312


---

**Convert rating File into dataframe**

In [0]:
#Build the ratings dataframe from the sampled "UserId,Rating,Date,MovieId" lines
test_file_df = pd.DataFrame([sub.split(",") for sub in refined_list],columns = ["UserId","Rating","YearWatched","MovieId"])

#Memory optimization: convert the id and rating columns from object (text) to numeric.
#Each column is converted once; the former second UserId pass with errors='ignore'
#was a no-op on an already numeric column and that option is deprecated in pandas.
test_file_df['MovieId'] = pd.to_numeric(test_file_df['MovieId'])
test_file_df['UserId'] = pd.to_numeric(test_file_df['UserId'])
test_file_df['Rating'] = pd.to_numeric(test_file_df['Rating'])

#Convert the column to datetime and keep only the year
test_file_df['YearWatched'] = pd.to_datetime(test_file_df['YearWatched']).dt.year

#Look up each rating's movie in the key dataframe (indexed by MovieId)
#Copy the Genre Column from key file pandas dataframe
test_file_df['Genre'] = test_file_df['MovieId'].map(key_file_df['Genre'])

#Copy the Year Released Column from key file pandas dataframe
test_file_df['YearReleased'] = test_file_df['MovieId'].map(key_file_df['Year'])

#Copy the Name Column from key file pandas dataframe
test_file_df['Name'] = test_file_df['MovieId'].map(key_file_df['Name'])


In [70]:
#Deep memory usage of the ratings dataframe after the dtype conversions.
#NOTE(review): an earlier note here claimed the conversions cut memory from 5.4 GB to 2 GB;
#the captured output for this sampled frame shows 98.1 MB, so that figure presumably
#came from a run on unsampled data — confirm before quoting it.
test_file_df.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 585311 entries, 0 to 585310
Data columns (total 7 columns):
UserId          585311 non-null int64
Rating          585311 non-null int64
YearWatched     585311 non-null int64
MovieId         585311 non-null int64
Genre           585311 non-null object
YearReleased    585311 non-null float64
Name            585311 non-null object
dtypes: float64(1), int64(4), object(2)
memory usage: 98.1 MB


In [71]:
test_file_df.head()

Unnamed: 0,UserId,Rating,YearWatched,MovieId,Genre,YearReleased,Name
0,1488844,3,2005,1,Animation,2003.0,Dinosaur Planet
1,822109,5,2005,1,Animation,2003.0,Dinosaur Planet
2,885013,4,2005,1,Animation,2003.0,Dinosaur Planet
3,30878,4,2005,1,Animation,2003.0,Dinosaur Planet
4,823519,3,2004,1,Animation,2003.0,Dinosaur Planet


In [72]:
#Count the missing (NaN) values in each column of test_file_df.
#Despite its name this is a Series of per-column counts, not a True/False matrix.
the_NaN_matrix = test_file_df.isnull().sum()
the_NaN_matrix

UserId          0
Rating          0
YearWatched     0
MovieId         0
Genre           0
YearReleased    0
Name            0
dtype: int64

In [73]:
#Remove ',,' from Genre first (presumably left over from joining empty Genre2/Genre3
#slots), then measure the cleaned text so Length always matches what Genre holds.
test_file_df['Genre'] = test_file_df['Genre'].str.replace(',,', '')
test_file_df['Length'] = test_file_df['Genre'].str.len()
#Show the rows with the shortest genre text
test_file_df.sort_values(by='Length').head()


Unnamed: 0,UserId,Rating,YearWatched,MovieId,Genre,YearReleased,Name,Length
473622,41412,2,2001,3715,War,1985.0,Commando,3
473833,264160,3,2005,3715,War,1985.0,Commando,3
473832,1324577,2,2005,3715,War,1985.0,Commando,3
473831,1569850,3,2005,3715,War,1985.0,Commando,3
473830,885635,4,2005,3715,War,1985.0,Commando,3


In [74]:
#Per-movie summary of the sampled ratings: how many there are and their mean
ratings_by_movie = test_file_df.groupby( [ "MovieId"] )
infor_df = pd.DataFrame({'count' : ratings_by_movie.size()})
infor_df['MeanRating'] = ratings_by_movie['Rating'].mean()

#print(infor_df.sort_values(by='count',ascending=False))
infor_df



Unnamed: 0_level_0,count,MeanRating
MovieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,16,3.687500
3,60,3.683333
6,31,2.967742
7,3,1.666667
10,7,3.285714
12,16,3.312500
15,9,3.444444
16,81,3.024691
17,213,2.943662
18,322,3.791925


In [0]:
#Mean Rating for the movie: attach each movie's mean (from infor_df, indexed by MovieId) to every one of its rating rows
test_file_df['MeanRatingForMovie'] = test_file_df['MovieId'].map(infor_df['MeanRating'])

In [76]:
#Preview only the first rows; rendering the whole frame is slow and bloats the notebook
test_file_df.head(10)

Unnamed: 0,UserId,Rating,YearWatched,MovieId,Genre,YearReleased,Name,Length,MeanRatingForMovie
0,1488844,3,2005,1,Animation,2003.0,Dinosaur Planet,9,3.687500
1,822109,5,2005,1,Animation,2003.0,Dinosaur Planet,9,3.687500
2,885013,4,2005,1,Animation,2003.0,Dinosaur Planet,9,3.687500
3,30878,4,2005,1,Animation,2003.0,Dinosaur Planet,9,3.687500
4,823519,3,2004,1,Animation,2003.0,Dinosaur Planet,9,3.687500
5,893988,3,2005,1,Animation,2003.0,Dinosaur Planet,9,3.687500
6,124105,4,2004,1,Animation,2003.0,Dinosaur Planet,9,3.687500
7,1248029,3,2004,1,Animation,2003.0,Dinosaur Planet,9,3.687500
8,1842128,4,2004,1,Animation,2003.0,Dinosaur Planet,9,3.687500
9,2238063,3,2005,1,Animation,2003.0,Dinosaur Planet,9,3.687500


In [0]:
#To download the Panda Data Frame
#Write it to a CSV file without the row index, then pass that file to google.colab's files.download
test_file_df.to_csv('test_file_df.csv',index=False)
files.download('test_file_df.csv')



---



---
**_Method_: see the RAM and GPU usage**

In [0]:
def checkGPU():
  # memory footprint support libraries/code
  !ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
  !pip install gputil
  !pip install psutil
  !pip install humanize
  import psutil
  import humanize
  import os
  import GPUtil as GPU
  GPUs = GPU.getGPUs()
  # XXX: only one GPU on Colab and isn’t guaranteed
  gpu = GPUs[0]
  process = psutil.Process(os.getpid())
  print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " I Proc size: " + humanize.naturalsize( process.memory_info().rss))
  print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
  


---

In [80]:
checkGPU()

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/45/99/837428d26b47ebd6b66d6e1b180e98ec4a557767a93a81a02ea9d6242611/GPUtil-1.3.0.tar.gz
Building wheels for collected packages: gputil
  Running setup.py bdist_wheel for gputil ... [?25l- done
[?25h  Stored in directory: /content/.cache/pip/wheels/17/0f/04/b79c006972335e35472c0b835ed52bfc0815258d409f560108
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.3.0
Collecting humanize
  Downloading https://files.pythonhosted.org/packages/8c/e0/e512e4ac6d091fc990bbe13f9e0378f34cf6eecd1c6c268c9e598dcf5bb9/humanize-0.5.1.tar.gz
Building wheels for collected packages: humanize
  Running setup.py bdist_wheel for humanize ... [?25l- \ done
[?25h  Stored in directory: /content/.cache/pip/wheels/69/86/6c/f8b8593bc273ec4b0c653d3827f7482bb2001a2781a73b7f44
Successfully built humanize
Installing collected packages: humanize
Successfully installed humanize-0.5.1
Gen RAM Free: 8.



---

## **CODE GRAVEYARD**



---



In [0]:
#This Method gives the Numpy shape error; reason unidentified

# def do_sampling(list_to_iterate):
#   flag=True
#   #Limiter variable
#   limiter = 100
#   count = 0
#   #variable to be returned
#   refined_list = []
#   key_to_search = 0
#   movieId_to_search=''
#   for ele in list_to_iterate:
#     #If the element has a ':' then its a movie id
#     if ele.find(':') != -1:
#       count = 1
#       #we will remove the ':' from the tuple
#       #This is done as we can then successfully apply the pd.to_numeric method to drastically reduce the dataframe size
#       movieId_to_search = ele.replace(':','')
#       #We want only those movies which are present in the movie_Title_file
      
#       if movieId_to_search in key_movie_values:
#         print('found')
#         #If this movie is present, set flag as true
#         flag = True
#         refined_list.append(movieId_to_search)
#       else:
#         flag = False
#     else:
#       #If the flag is false(i.e. movie not found) or if the count more than the limit; SKIP the record
#       if count > limiter or flag is False:
#         continue
#       #Add a comma separated value of the movieId to split and form a column later
#       ele += ',' +str(movieId_to_search)
      
#       refined_list.append(ele)
      
#       count+=1
#   return refined_list



---

**The 2 blocks below have been deprecated**



---



In [0]:
#Ultra fast method to get the indexes of the rows we need to drop; Reduces time from a minute to a second
#For records which only had the movie id and nothing else, will have NaN for columns other than the first column
#fancy_list = test_file_df[the_NaN_matrix['Rating'] == True].index.values

In [0]:
#type(fancy_list[2])

---
**This below block is deprecated, due to the faster and optimized ways.**

In [0]:
# #We will store the rows to drop in this
# rows_to_drop = []
# currentMovieId = 0
# list_movieId = []


# #Iterate over the dataframe to split it
# for index, row in test_file_df.iterrows():
#   if the_NaN_matrix.iloc[index,2]:
#     #If entered it means that there is a colon on the row and it is a movie id
#     #row gives the first character on that row
#     #currentMovieId = row['UserId']
#     rows_to_drop.append(index)
#   else:
#     list_movieId.append(currentMovieId)
#     #print("Not")
#     #Assign the movie id to the consecutive rows
#     #test_file_df.iloc[index,3] = currentMovieId

# #Create a series object from the list
# #MovieIdSeries = pd.Series(list_movieId)  



---



In [0]:
#Drop the rows which had the movie Id and the columns to get a seamless dataframe
#test_file_df.drop(test_file_df.index[fancy_list],inplace=True)




---





---

