##Importing libraries

In [1]:
import requests, os, sqlite3, csv, omdb, pickle, numpy as np, re

##Creating the movies database

In [150]:
with sqlite3.connect('movies_database.db') as connection:
    c = connection.cursor()
    c.execute("DROP TABLE IF EXISTS Movies")

In [151]:
#create database and movies table
with sqlite3.connect('movies_database.db') as connection:
    c = connection.cursor()
    c.execute("CREATE TABLE Movies(imdbID PRIMARY KEY, Title TEXT, Genre TEXT, Plot TEXT, Poster TEXT)")

##Loading the movies into the database

In [152]:
#setting the file path where the csv is present
def loadMovies(path,file):
    '''function requires path on computer where the csv file is stored and the name of the csv file containing
    the title of the films for which information needs to be obtained from the omdb API. The retrieved information
    will be inserted into the movies table belonging to the movies_database'''
    
    filePath = path
    fileName = file
    with sqlite3.connect('movies_database.db') as connection, open(os.path.join(filePath, fileName), "rb") as myFile:
        c = connection.cursor()
        myFileReader = csv.reader(myFile)
        for movies in myFileReader:
            r = omdb.request(t= movies[0],plot="full",r='json') #using indexing since data type of movies is a list
            movieValues = (r.json()['imdbID'],r.json()['Title'],r.json()['Genre'],r.json()['Plot'],
                           r.json()['Poster'])
            c.execute("INSERT INTO Movies VALUES(?, ?, ?, ?, ?)",(movieValues))

In [153]:
loadMovies('/home/clarence/Documents/theDataGeek','movielist.csv')

##Creating a text file containing unique genres

In [386]:
def uniqueGenres(path, filename):
    '''A movie may be categorized into more than one genre, to create a corpus for each genre, we will need to first
    extract each individual genre from the 'Genre' field in the movies table which are saved as tuples. We will then
    proceed to retain only the unique values for genres. Lastly we will use these values to subset movie plots into
    corpora'''

    listofLists = []
    singleList = []
    global uniqueList
    uniqueList = []
    with sqlite3.connect('movies_database.db') as connection, open(os.path.join(path, filename),
                                                                   "wb") as my_file:
        my_file_writer = csv.writer(my_file,delimiter=",",quotechar="'")
        c = connection.cursor()
        c.execute("SELECT Genre FROM Movies")
        for row in c.fetchall():
            listofLists.append(row[0].split(',')) # splitting the multiple genre string into individual list items
        for each_list in listofLists: # first level loop iterates over each list
            for each_item in range(0,len(each_list)): # nested loop iterates over each list index
                singleList.append(each_list[each_item]) # creates one list with duplicate genres
        singleList = [item.strip() for item in singleList] #remove leading and trailing whitespace
        [uniqueList.append(item) for item in singleList if item not in uniqueList] #drop duplicates
        uniqueList.remove('N/A') # remove null values
        uniqueList.sort()
        
        '''create a multidimensional numpy array with rows equal to length of uniqueList and one column.
        Data type is specified as 16 character strings, to store each genre as a list within a list.
        This additional step is required as the writerows function requires that csv values to be written
        are a list of list data structure'''
        
        uniqueArray = np.array(range(len(uniqueList)), dtype='a16').reshape(len(uniqueList),1)
        for item in range(0,len(uniqueList)):
            uniqueArray[item][0] = uniqueList[item]
        my_file_writer.writerows(uniqueArray) # write all rows at once
        del listofLists, singleList, uniqueList #optimizing memory usage

In [387]:
#running the function to create a list of unique genres
uniqueGenres(path='/home/clarence/Documents/theDataGeek/nlpRecommender', filename='genrelist.csv')

In [2]:
#testing if the data was sucessfully written to a csv file
with open(os.path.join('/home/clarence/Documents/theDataGeek/nlpRecommender', 'genrelist.csv'), "rb") as myFile:
    myFileReader = csv.reader(myFile)
    for genres in myFileReader:
        print genres

['Action']
['Adventure']
['Animation']
['Biography']
['Comedy']
['Crime']
['Documentary']
['Drama']
['Family']
['Fantasy']
['History']
['Horror']
['Music']
['Musical']
['Mystery']
['Romance']
['Sci-Fi']
['Short']
['Sport']
['Thriller']
['Western']


##Filter plots by genre and write to .txt files

In [78]:
#code to manually filter plots if they contain specific genre names in the entire string
inPath = '/home/clarence/Documents/theDataGeek/nlpRecommender'
outPath = '/home/clarence/Documents/theDataGeek/nlpRecommender/corpora'
fileName = 'genrelist.csv'

with sqlite3.connect('movies_database.db') as connection, open(os.path.join(outPath,'Western.csv'),
                                                               "wb") as outputFile:
    c = connection.cursor() 
    my_file_writer = csv.writer(outputFile,delimiter=",")  
    c.execute("SELECT Plot FROM Movies WHERE Genre LIKE '%Western%' " )
    my_file_writer.writerows(c.fetchall())
    
del inPath, outPath, fileName

In [None]:
'''create individual corpus based on unique genres. We will use pattern search '%'+xxx+'%' where xxx is the substring
pattern. This is the automated version of the preceeding block of code. The problem is using the iterator value
in this case genre[0] in the LIKE clause - which has been solved by Maruthi'''

inPath = '/home/clarence/Documents/theDataGeek/nlpRecommender'
outPath = '/home/clarence/Documents/theDataGeek/nlpRecommender/corpora'
fileName = 'genrelist.csv'
with sqlite3.connect('movies_database.db') as connection,open(os.path.join(inPath, fileName), "rb") as inputFile:
    myFileReader = csv.reader(inputFile)
    c = connection.cursor()
    for genre in myFileReader:
        '''store output in files named after each genre'''
        with open(os.path.join(outPath,'%s.csv' %genre[0]), "wb") as outputFile:
            my_file_writer = csv.writer(outputFile,delimiter=",",quotechar="'")  
            c.execute("SELECT Plot FROM Movies WHERE Genre LIKE ?", ('%'+genre[0]+'%')) # THANKS MARUTHI! :)
            my_file_writer.writerows(c.fetchall())
            
del inPath, outPath, fileName

##Create tokenized documents for each genre and remove punctutation

In [72]:
# tokenize, change to lower and remove punctuation for each plot filtered in the previous step
from nltk import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string, nltk

inPath = '/home/clarence/Documents/theDataGeek/nlpRecommender/corpora'
outPath = '/home/clarence/Documents/theDataGeek/nlpRecommender/tokenized'
fileName = 'genrelist.csv'
genreList = [] #store genre names in memory from csv file
tokenizer = WordPunctTokenizer() #instance of WordPunctTokenizer class
lemmatizer = WordNetLemmatizer() #instance of WordNetLemmatizer class

with open(os.path.join('/home/clarence/Documents/theDataGeek/nlpRecommender', fileName), "rb") as inputFile:
          myFileReader = csv.reader(inputFile)
          for i in myFileReader:
              genreList.append(i)

#creating a list of stop words and punctuations which we will remove from each document
stop = stopwords.words('english')
for punctuation in string.punctuation:
    stop.append(punctuation)

for index in range(0,len(genreList)):
    with open(os.path.join(inPath, '%s.txt' %genreList[index][0]), "r") as inputFile,open(
        os.path.join(outPath,'%s.txt' %genreList[index][0]), "w") as outputFile:
        text = inputFile.readlines()
        for lines in text:
        #for lines in range(0,len(text)):
            words = [i for i in lemmatizer.lemmatize(lines.lower())]
            words = [i for i in tokenizer.tokenize(lines.lower()) if i not in stop]
            for each_word in words:
                outputFile.write(str(each_word))
                outputFile.write('\n')
                
del inPath, outPath, fileName, genreList, tokenizer, lemmatizer, text, words

In [76]:
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('heroic')

'heroic'

##Add categories or genres to the tokenized documents and integrate them into a corpus

In [8]:
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
reader = CategorizedPlaintextCorpusReader('/home/clarence/Documents/theDataGeek/nlpRecommender/tokenized', r'.*\.txt',
cat_pattern=r'(\w+)/*') # cat_pattern means use the filename excluding the extension as the category/genre name

In [9]:
reader.words('Action.txt')

[u'nick', u'fury', u'director', u'h', u'e', u'l', u'd', ...]

In [10]:
#listing all the categories in the corpus
reader.categories()

['Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci',
 'Short',
 'Sport',
 'Thriller',
 'Western']

In [11]:
#validating that the mapping of category to document
reader.categories(['Western.txt'])

['Western']

In [42]:
#conditional frequency distribution 
cfd = nltk.ConditionalFreqDist((genre, word)
                               for genre in reader.categories()
                               for word in reader.words(categories=genre))

In [43]:
cfd.conditions()

['Mystery',
 'Romance',
 'Short',
 'Sci',
 'Family',
 'Horror',
 'Thriller',
 'Sport',
 'Crime',
 'Drama',
 'Fantasy',
 'Western',
 'Animation',
 'Music',
 'Adventure',
 'Action',
 'Comedy',
 'Documentary',
 'Musical',
 'Biography',
 'History']

##Calculating chi-square value for each word genre pair

In [35]:
#create a contingency table from the conditional frequency distribution to be used in a chi-square test of assoc.
def nltk_cfd_to_pd_dataframe(cfd):
    """ Converts an nltk.ConditionalFreqDist object into a pandas DataFrame object. """
    import pandas as pd
    df = pd.DataFrame()
    for cond in cfd.conditions():
        col = pd.DataFrame(pd.Series(dict(cfd[cond])))
        col.columns = [cond]
        df = df.join(col, how = 'outer')

    df = df.fillna(0)

    return df

In [44]:
contingency = nltk_cfd_to_pd_dataframe(cfd)

In [45]:
#writing the dataframe to disk to remove punctuation, numbers etc. which may have remained. Will try to automate
contingency.to_csv('/home/clarence/Documents/theDataGeek/nlpRecommender/contingency.csv',sep='\t')

In [65]:
#reading the cleanded dataframe back into memory
import pandas as pd

contingency = pd.read_csv('/home/clarence/Documents/theDataGeek/nlpRecommender/contingency.csv',sep='\t',
                          index_col='Word',skiprows=0)

In [66]:
#calculating sum of each row
rowSums = contingency.sum(1)
#calculating the sum of each column
colSums= contingency.sum(0)

In [67]:
'''creating a 2130 X 21 multidimensional array to hold expected values of each cell, where
E = (rowSum*colSum)/Total'''
expected = np.array(range(len(rowSums)*len(colSums)),dtype='float').reshape(len(rowSums),len(colSums))

from __future__ import division # division from python 3 ensures floats and not ints are returned

for i in range(0,len(rowSums)):
    for j in range(0,len(colSums)):
        expected[i][j] = (rowSums[i]*colSums[j])/(rowSums.sum())

In [173]:
'''converting the array into a dataframe sharing column names and index as that of contigency, this is important
for the mathematical operations to follow'''
expectedDF = pd.DataFrame(expected,columns=contingency.columns,index=contingency.index)

In [70]:
#writing the expected values to csv, this will be required to build signatures of individual movies
expectedDF.to_csv('/home/clarence/Documents/theDataGeek/nlpRecommender/expected.csv', sep='\t')

In [72]:
'''first part of the equation (Observed-Expected)^2'''
chiSqVals = pd.DataFrame((contingency.values-expectedDF.values)**2, columns=contingency.columns,
                         index=contingency.index)

'''second part of the equation [(O-E)^2]/E'''
chiSqVals = chiSqVals.div(expectedDF.ix[0],axis='columns')

In [178]:
'''write out the final file in csv format and delete all other intermediate dataframes stored in memory'''
chiSqVals.to_csv('/home/clarence/Documents/theDataGeek/nlpRecommender/chiSqVals.csv',sep='\t')
del contingency, expectedDF,chiSqVals

##Calculating Genre Similarity

In [175]:
#load the chi-square value table for genres
import pandas as pd
genreChi = pd.read_csv('/home/clarence/Documents/theDataGeek/nlpRecommender/chiSqVals.csv', sep='\t')
expectedDF = pd.read_csv('/home/clarence/Documents/theDataGeek/nlpRecommender/expected.csv', sep='\t')

In [55]:
#the same process can be used for movie to movie similarity scores
from scipy import spatial

v1 = genreChi['Music']
v2 = genreChi['Musical']
result = 1 - spatial.distance.cosine(v1, v2)
print result

0.342935607224


In [75]:
#determining the number of words to retain as signatures for each genre using various quantile values
count = 0
df = genreChi.drop(genreChi.columns[0], axis=1, inplace=False)
for col in df.columns:
    for x in df[col]:
        if x >= df[col].quantile(0.95):
            count += 1
    print "number of words for %s are %s" %(col,count)
    count = 0 #resetting the counter to zero

number of words for Mystery are 164
number of words for Romance are 120
number of words for Short are 118
number of words for Sci are 158
number of words for Family are 136
number of words for Horror are 134
number of words for Thriller are 116
number of words for Sport are 136
number of words for Crime are 108
number of words for Drama are 205
number of words for Fantasy are 182
number of words for Western are 112
number of words for Animation are 120
number of words for Music are 112
number of words for Adventure are 110
number of words for Action are 107
number of words for Comedy are 114
number of words for Documentary are 124
number of words for Musical are 110
number of words for Biography are 112
number of words for History are 107


In [191]:
df1, df2 = pd.DataFrame(genreChi[['Word','Mystery']]), pd.DataFrame(expectedDF[['Word','Mystery']])
df1.columns, df2.columns = ['Word','MysChi'], ['Word','MysExp']
df1 = pd.merge(df1,df2)
df1 = df1[(df1.MysChi >= df1.MysChi.quantile(0.95))]
df1.head(2)

Unnamed: 0,Word,MysChi,MysExp
6,absolute,6.997358,0.080545
11,accused,27.989433,0.161089


In [2]:
#combinations/observations required to be stored in the DB for pairwise distance calculations for all movies

import math

def nCr(n,r):
    f = math.factorial
    return f(n) / f(r) / f(n-r)

if __name__ == '__main__':
    print nCr(100,2)

4950


In [None]:
'''For each genre create a dataframe by joining words, chisq-value and expected value. We will retain the top 5%
of words as signatures for the genre. This list will be written to disk.'''

outPath = '/home/clarence/Documents/theDataGeek/nlpRecommender/genre_signatures'
for col in range(1,genreChi.columns):
    
with open(os.path.join(outPath, '%s.csv' %col),"wb") as outPut:

In [31]:
#list comprehension example
[word95percentile.append(x) for x in genreChi.Mystery if x >= genreChi.Mystery.quantile(0.95)]
len(word95percentile)

492

In [132]:
a = np.array([1,0,0,1,1,1,0,1])
b = np.array([1,0,1,0,1,1,0,0])

print 1/spatial.distance.euclidean(a, b)
print
print 1-spatial.distance.cosine(a, b)

0.57735026919

0.67082039325


In [100]:
r = omdb.request(t= 'The Avengers',plot='full',r='json')

In [101]:
r.json()

{u'Actors': u'Robert Downey Jr., Chris Evans, Mark Ruffalo, Chris Hemsworth',
 u'Awards': u'Nominated for 1 Oscar. Another 31 wins & 66 nominations.',
 u'Country': u'USA',
 u'Director': u'Joss Whedon',
 u'Genre': u'Action, Adventure, Sci-Fi',
 u'Language': u'English, Russian',
 u'Metascore': u'69',
 u'Plot': u"Nick Fury is director of S.H.I.E.L.D, an international peace keeping agency. The agency is a who's who of Marvel Super Heroes, with Iron Man, The Incredible Hulk, Thor, Captain America, Hawkeye and Black Widow. When global security is threatened by Loki and his cohorts, Nick Fury and his team will need all their powers to save the world from disaster.",
 u'Poster': u'http://ia.media-imdb.com/images/M/MV5BMTk2NTI1MTU4N15BMl5BanBnXkFtZTcwODg0OTY0Nw@@._V1_SX300.jpg',
 u'Rated': u'PG-13',
 u'Released': u'04 May 2012',
 u'Response': u'True',
 u'Runtime': u'143 min',
 u'Title': u'The Avengers',
 u'Type': u'movie',
 u'Writer': u'Joss Whedon (screenplay), Zak Penn (story), Joss Whedon (s

In [96]:
#example of how to use the OMDB api directly
r1 = requests.get('http://www.omdbapi.com/?t=The+Avengers&plot=full&r=json')

In [141]:
str(r1.json()['imdbID'])

'tt0848228'

In [77]:
movieHeaders = list(r.json().keys())
print movieHeaders

[u'Plot', u'Rated', u'tomatoImage', u'Title', u'DVD', u'tomatoMeter', u'Writer', u'tomatoUserRating', u'Production', u'Actors', u'tomatoFresh', u'Type', u'imdbVotes', u'Website', u'tomatoConsensus', u'Poster', u'tomatoRotten', u'Director', u'Released', u'tomatoUserReviews', u'Awards', u'Genre', u'tomatoUserMeter', u'imdbRating', u'Language', u'Country', u'BoxOffice', u'Runtime', u'tomatoReviews', u'imdbID', u'Metascore', u'Response', u'tomatoRating', u'Year']


In [71]:
from imdb import IMDb
ia= IMDb()

In [78]:
movieHeaders[0]

u'Plot'

In [79]:
for i in range(0, len(r.json().keys())):
  type(r.json().values()[i])

In [80]:
len(r.json().keys())

34

In [81]:
int(r.json().values()[5])

92

In [82]:
print r.json()['Genre']

Action, Adventure, Sci-Fi


In [99]:
MovieID,Title,Genre=[str(r.json()['imdbID']),str(r.json()['Title']),str(r.json()['Genre'])]

In [None]:
ipcluster