# Template 

- Author: Israel Oliveira [\[e-mail\]](mailto:'Israel%20Oliveira%20'<prof.israel@gmail.com>)

In [1]:
%load_ext watermark
%config Completer.use_jedi = False

In [2]:
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

import pandas as pd
import json

In [3]:
# Run this cell before closing the notebook.
%watermark -d --iversion -b -r -g -m -v
!cat /proc/cpuinfo |grep 'model name'|head -n 1 |sed -e 's/model\ name/CPU/'
!free -h |cut -d'i' -f1  |grep -v total

Python implementation: CPython
Python version       : 3.7.9
IPython version      : 7.19.0

Compiler    : GCC 8.3.0
OS          : Linux
Release     : 5.8.0-7630-generic
Machine     : x86_64
Processor   : 
CPU cores   : 8
Architecture: 64bit

Git hash: 1f1fb0a39af877f76c3c65a57dd36db29f6b1860

Git repo: https://github.com/ysraell/dbsc.git

Git branch: main

nltk      : 3.5
re        : 2.2.1
numpy     : 1.19.5
pandas    : 1.2.1
ipywidgets: 7.6.3
json      : 2.0.9
sys       : 3.7.9 (default, Jan 12 2021, 17:26:22) 
[GCC 8.3.0]

CPU	: Intel(R) Xeon(R) CPU E3-1241 v3 @ 3.50GHz
Mem:           31G
Swap:         4.0G


In [27]:
def prepr(*args, **kwargs):
    """Print the ``repr()`` of an object.

    All arguments are forwarded unchanged to the built-in ``repr``, which
    accepts exactly one positional argument; any other call shape raises
    ``TypeError``. Returns whatever ``print`` returns (``None``).
    """
    representation = repr(*args, **kwargs)
    return print(representation)

def strip_punctuation(s, join_str=''):
    """Drop every element of ``s`` that is found in ``string.punctuation``.

    Args:
        s: Iterable of characters (typically a ``str``).
        join_str: Separator placed between the surviving elements.

    Returns:
        The surviving elements joined with ``join_str``.
    """
    survivors = [ch for ch in s if ch not in string.punctuation]
    return join_str.join(survivors)

def special2space(s):
    """Replace every character of ``s`` found in ``punctuation_new`` by a space.

    NOTE(review): ``punctuation_new`` is a module-level name that is not
    defined in the visible part of this notebook; it must exist in the
    kernel before this function is called, otherwise a ``NameError`` is raised.
    """
    return ''.join(' ' if ch in punctuation_new else ch for ch in s)

# Apostrophes, hyphens and double quotes (the characters stripped to undo contractions).
contractions = re.compile(r"'|-|\"")
# Any run of non-word characters, captured so it can be re-emitted padded with spaces.
symbols = re.compile(r'(\W+)', flags=re.UNICODE)
# A single non-whitespace character with whitespace on both sides.
singles = re.compile(r'(\s\S\s)', flags=re.IGNORECASE | re.UNICODE)
# Any run of whitespace (used to collapse separators to one space).
seps = re.compile(r'\s+')

def clean_punkt_singles_contract(text):
    """Normalise punctuation and whitespace in ``text``.

    Steps, in order:
      1. every non-ASCII character becomes a space;
      2. apostrophes, hyphens and double quotes are deleted (``contractions``);
      3. each run of non-word characters is surrounded by spaces (``symbols``);
      4. every run of whitespace collapses to a single space (``seps``).

    NOTE(review): despite the function name, the ``singles`` pattern is not
    applied here, so single-character tokens are kept.

    Args:
        text: Input string.

    Returns:
        The normalised string.
    """
    ascii_only = re.sub(r'[^\x00-\x7f]', r' ', text)
    without_quotes = contractions.sub('', ascii_only)
    padded = symbols.sub(r' \1 ', without_quotes)
    return seps.sub(' ', padded)

def clean_text(text):
    '''Remove unwanted characters and extra spaces from the text.

    The text is first normalised by ``clean_punkt_singles_contract`` and then
    passed through a fixed, ordered list of regex substitutions.

    Notes:
        * ``clean_punkt_singles_contract`` deletes every apostrophe and
          collapses all whitespace to single spaces, so the apostrophe-based
          patterns and the newline pattern below cannot match after it; they
          are kept in case the normalisation step changes.
        * The ``'9x`` / ``a0`` patterns look like leftovers of escaped
          typographic characters - TODO confirm their origin. ``a0`` is
          removed wherever it occurs, including inside longer tokens.
        * Underscores and dots are turned into spaces *after* repeated spaces
          are collapsed, so the result may still contain consecutive spaces.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    '''
    # Remove punctuation and symbols
    text = clean_punkt_singles_contract(text)

    # (pattern, replacement) pairs, applied strictly in this order.
    # Patterns are raw strings so that regex escapes such as \! \? \. are not
    # interpreted (and warned about) as invalid string escape sequences.
    substitutions = (
        (r"'s", ''),
        (r'\n', ' '),
        (r'[{}@,:*>()\\#%+=\[\]]', ''),
        (r'a0', ''),
        (r"'92t", "'t"),
        (r"'92s", "'s"),
        (r"'92m", "'m"),
        (r"'92ll", "'ll"),
        (r"'91", ''),
        (r"'92", ''),
        (r"'93", ''),
        (r"'94", ''),
        (r'\!', '! '),
        (r'\?', '? '),
        (r' +', ' '),
        (r'_', ' '),
        (r'\.', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

def proc_text(text, min_size_word = 1):
    """Tokenize ``text`` and return its lower-cased, filtered tokens.

    A token produced by the module-level ``tokenizer`` is kept only if it
      * is not found in ``string.punctuation`` (a substring test on that string),
      * is not in NLTK's English stopword list,
      * has at least ``min_size_word`` characters, and
      * contains at least one alphabetic character.

    NOTE(review): the stopword test runs on the token as tokenized, before
    lower-casing, so capitalised stopwords such as "The" are kept. This is the
    original behaviour and is preserved here - confirm whether it is intended.

    Args:
        text: Any object; it is converted with ``str()`` first.
        min_size_word: Minimum token length to keep.

    Returns:
        list[str]: The surviving tokens, lower-cased, in original order.
    """
    text = str(text)
    # Build the stopword set once per call; the original evaluated
    # stopwords.words('english') again for every single token.
    english_stopwords = set(stopwords.words('english'))
    return [
        token.lower()
        for token in tokenizer.tokenize(text)
        if token not in string.punctuation
        and token not in english_stopwords
        and len(token) >= min_size_word
        and any(ch.isalpha() for ch in token)
    ]

In [28]:
!ls ../data/

actor2id.json  id2movie.json  movie_actor.csv  token2term.json
casts.csv      movie2id.json  term2token.json


In [29]:
# Load the movie/actor table; the cells below read its `Movie` and `Actor` columns.
df_movie_actor = pd.read_csv('../data/movie_actor.csv')

In [30]:
# Integer id -> movie title: distinct titles are renumbered from 0 after de-duplication.
dict_id2movie = df_movie_actor.Movie.drop_duplicates().reset_index(drop=True).to_dict()

In [31]:
# Inverse lookup: movie title -> integer id.
dict_movie2id = {m:k for k,m in dict_id2movie.items()}

In [32]:
# Sanity check: id 0 -> title -> id should give back 0.
print(dict_id2movie[0])
print(dict_movie2id[dict_id2movie[0]])

Pygmalion
0


In [33]:
# Add a `movie_id` column holding the integer id of each row's movie title
# (raises KeyError for a title missing from dict_movie2id).
df_movie_actor['movie_id'] = df_movie_actor.Movie.apply(lambda x: dict_movie2id[x])

In [34]:
# Actor -> list of the movie ids found on that actor's rows.
dict_actor2movieid = df_movie_actor[['Actor','movie_id']].groupby('Actor').agg(list).to_dict()['movie_id']

In [35]:
# Persist the lookups as JSON.
# Note: JSON object keys are always strings, so the integer ids used as keys
# in dict_id2movie are written as strings and must be converted back on load.
with open('../data/id2movie.json','w') as f:
    json.dump(dict_id2movie, f)

with open('../data/movie2id.json','w') as f:
    json.dump(dict_movie2id, f)
    
# actor2id.json stores actor -> list of movie ids.
with open('../data/actor2id.json','w') as f:
    json.dump(dict_actor2movieid, f)

In [36]:
# All distinct movie titles followed by all distinct actor names.
terms = df_movie_actor.Movie.drop_duplicates().tolist() + df_movie_actor.Actor.drop_duplicates().tolist()

In [37]:
# Pair every term with the tokens proc_text extracts from it ...
df_dataset = pd.DataFrame(zip(terms,map(proc_text,terms)),columns=['golden_terms','golden_tokens'])
# ... and keep only the terms that produced at least one token.
df_dataset = df_dataset.loc[df_dataset.golden_tokens.apply(lambda x: x != [])].reset_index(drop=True)

In [38]:
# One row per (term, token) pair, then token -> term.
# NOTE(review): a dict holds one value per key, so a token shared by several
# terms ends up mapped to a single term only - confirm this is intended.
dict_token2term = df_dataset.explode('golden_tokens').reset_index(drop=True).set_index('golden_tokens').to_dict()['golden_terms']

In [39]:
# Persist the token -> term lookup as JSON.
with open('../data/token2term.json','w') as f:
    json.dump(dict_token2term, f)

In [43]:
# Term -> list of its tokens.
dict_term2token = df_dataset.set_index('golden_terms').to_dict()['golden_tokens']

In [44]:
# Persist the term -> tokens lookup as JSON.
with open('../data/term2token.json','w') as f:
    json.dump(dict_term2token, f)