# Purpose
1. generate the posting units and tags from the raw data set and feed into the inverted index
2. conduct some analyzing about the dataset and searching result

In [1]:
import os
os.chdir('..')  # change current path to the root of project

from inv_index import Index
from probes import Index_probe
from data_structure import Post_unit, Doc
import pandas as pd
import global_settings as gs
from collections import Counter
import re

os.path.abspath(os.curdir)

'C:\\desktop\\workspace\\moive_recommendation'

# Load the raw data  
1. tags.csv  
2. movies.csv

In [2]:
genScorePath = os.path.join(gs.datasetPath, 'genome-scores.csv')
# genTagPath = os.path.join(gs.datasetPath, 'genome-tags.csv')
tagPath = os.path.join(gs.datasetPath, 'tags.csv')
moviePath = os.path.join(gs.datasetPath, 'movies.csv')

In [3]:
genScoreDF = pd.read_csv(genScorePath, delimiter = ',')
# genTagDF = pd.read_csv(genTagPath, delimiter = ',')
tagDF = pd.read_csv(tagPath, delimiter = ',')
movieDF = pd.read_csv(moviePath, delimiter = ',')

In [22]:
tagDF.head(10)

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078
5,65,668,bollywood,1368149876
6,65,898,screwball comedy,1368150160
7,65,1248,noir thriller,1368149983
8,65,1391,mars,1368150055
9,65,1617,neo-noir,1368150217


In [5]:
len(tagDF)

465564

In [6]:
# not all the movieId are presenting - not all movie have tags
tagDF.movieId.max() - tagDF.movieId.unique().shape

array([111713])

In [7]:
movieDF.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
len(movieDF)

27278

In [9]:
# not all movie have tag genome scores
genScoreDF.movieId.unique().shape

(10381,)

# Transfer the raw data into posting unit format  
*userId,movieId,tag,timestamp -> tagText, currentId, nextId, previousId, uPropJson, moiveId, status*  
**Note**: same tags made by different user are regard as different occurances, so that the tf is set


In [10]:
def tagClean(tagText):
    tagText = re.sub('[^a-zA-Z _]', '', tagText)  # eliminate symbols which are not letters or space or _
    tagText = tagText.replace(' ', '_')
    if tagText.upper() in ['CON', 'PRN', 'AUX', 'NUL', 'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9', 'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9']:
        tagText = '_%s_'%tagText
    return tagText.lower()

In [11]:
# collecting the occurance of tags in terms of one movie
movieTagCounter = Counter()  
longTagCount = 0
for record in tagDF.values:
    movieId = record[1]
    tagText = str(record[2])
    if tagText.count(' ') > 5:  
        longTagCount += 1
    else:
        # if the tag is too long, eliminate it
        tagText = tagClean(tagText)
        if tagText != '':
            movieTagCounter[(movieId, tagText)] += 1

In [12]:
# 19991 out of 465,564 tags are long tags, just leave them there.
longTagCount

1651

In [13]:
flatPUnitList = []  # for collecting the flatten posting units

In [14]:
# tagText, currentId, nextId, previousId, uPropJson, moiveId, status
for movieTagPair in movieTagCounter:
    flatPUnitList.append("%s -1 -1 -1 {\"tf\":%d} %d 1"%(movieTagPair[1], movieTagCounter[movieTagPair], movieTagPair[0]))

# Feed the flatten posting units in to inverted index

In [15]:
idx = Index()

In [16]:
%%time
for flatPUnit in flatPUnitList:
    pUnit = Post_unit.deflatten(flatPUnit)    
    idx.add_post_unit(pUnit)  # the pUnit is assigned new id after this step
    
    # add the corresponding movie info into the index
    if not pUnit.docId in idx.docInfo:
        
        movieInfo = movieDF.loc[movieDF.movieId == pUnit.docId].values.ravel()
        title = movieInfo[1]
        genre = movieInfo[2]
        
        # fileds of doc
        basicInfo = {}
        basicInfo['title'] = title.replace(' ', '_')
        basicInfo['genre'] = genre.replace(' ', '_')
        pUnitIds = [pUnit.currentId]
        
        doc = Doc()
        doc.docId = pUnit.docId
        doc.basicInfo = basicInfo
        doc.pUnitIds = pUnitIds

        idx.add_doc_info(doc)
        
    else:
        idx.docInfo[pUnit.docId].pUnitIds.append(pUnit.currentId)

Wall time: 13 s


In [17]:
idx.persist_index()

2018-12-01 18:10:57,086 [INFO] - Index.py:95 persist last post unit ID
2018-12-01 18:10:57,090 [INFO] - Index.py:68 persist lexicon
2018-12-01 18:10:57,375 [INFO] - Index.py:77 persist posting
2018-12-01 18:12:00,171 [INFO] - Index.py:102 persist doc info


# Analysis  


In [18]:
# The tag set does not have all 27000 movies
len(set(movieDF.movieId.unique()) - set(tagDF.movieId.unique()))

7733

In [19]:
# 44 movies removed due to the tag regard as illgeal
len(set(tagDF.movieId.unique()) - set(idx.docInfo.keys()))

44

In [20]:
# examples of lost movies
list(set(tagDF.movieId.unique()) - set(idx.docInfo.keys()))[:10]

[644, 56069, 110982, 26378, 115467, 71180, 125966, 5520, 54419, 8733]

In [25]:
len(idx.docInfo[32587].pUnitIds)

161

In [70]:
# upper bound of length of posting list
# not the actual length, as the same tags from different user are counted multiples
tgb = tagDF.groupby('tag')
tgb.count()['movieId'][tgb.count()['movieId'] == 3384]

tag
sci-fi    3384
Name: movieId, dtype: int64

# check intersection of two key word  
1. comedy  
2. romance

In [53]:
docSet1 = set()
for pUnitId in idx.lexicon['dark_hero'].pUnitIds:
    docSet1.add(idx.posting[pUnitId].docId)

In [54]:
docSet2 = set()
for pUnitId in idx.lexicon['noir_thriller'].pUnitIds:
    docSet2.add(idx.posting[pUnitId].docId)

In [55]:
docSet1.intersection(docSet2)

{541}

In [59]:
# check the number of tags of 541
len(idx.docInfo[541].pUnitIds)

178

In [71]:
# N
len(idx.docInfo)

19501

# Check shared tags of two movies