## Bibliotecas

In [4]:
#importando bibliotecas
import numpy as np
import pandas as pd 
import nltk
nltk.download('punkt_tab')
from nltk import sent_tokenize, word_tokenize
import string
import unidecode
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

from datetime import datetime

[nltk_data] Downloading package punkt_tab to /home/bruno/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/bruno/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Carregando Dataset

In [5]:
#Abrindo arquivos
df = pd.read_csv("filmes.csv")
df.head()

Unnamed: 0,genres,averageRating,numVotes,sinopse,primaryTitle,startYear,runtimeMinutes,actors_names,directors_names
0,Family,7.1,387992,When two kids find and play a magical board ga...,Jumanji,1995,104,['Robin Williams' 'Jonathan Hyde' 'Jonathan Hy...,['Joe Johnston']
1,Romance,6.7,30265,John and Max resolve to save their beloved bai...,Grumpier Old Men,1995,101,['Walter Matthau' 'Jack Lemmon' 'Burgess Mered...,['Howard Deutch']
2,Romance,6.0,12585,"Based on Terry McMillan's novel, this film fol...",Waiting to Exhale,1995,124,['Gregory Hines' 'Dennis Haysbert' 'Mykelti Wi...,['Forest Whitaker']
3,"Romance,Family",6.1,42555,George Banks must deal not only with his daugh...,Father of the Bride Part II,1995,106,['Steve Martin' 'Martin Short' 'George Newbern...,['Charles Shyer']
4,"Crime,Action",8.3,738636,A group of high-end professional thieves start...,Heat,1995,170,['Al Pacino' 'Robert De Niro' 'Val Kilmer' 'Jo...,['Michael Mann']


In [6]:
#Informaçoes dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30358 entries, 0 to 30357
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   genres           30358 non-null  object 
 1   averageRating    30358 non-null  float64
 2   numVotes         30358 non-null  int64  
 3   sinopse          30358 non-null  object 
 4   primaryTitle     30358 non-null  object 
 5   startYear        30358 non-null  int64  
 6   runtimeMinutes   30358 non-null  object 
 7   actors_names     30358 non-null  object 
 8   directors_names  30358 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 2.1+ MB


In [7]:
#Verificando as features
df.columns

Index(['genres', 'averageRating', 'numVotes', 'sinopse', 'primaryTitle',
       'startYear', 'runtimeMinutes', 'actors_names', 'directors_names'],
      dtype='object')

In [8]:
#Feature a ser analisada (sinopse) 
data_features = df['sinopse'] #Tratando com Serie de string
data_features.head(10)

0    When two kids find and play a magical board ga...
1    John and Max resolve to save their beloved bai...
2    Based on Terry McMillan's novel, this film fol...
3    George Banks must deal not only with his daugh...
4    A group of high-end professional thieves start...
5    An ugly duckling having undergone a remarkable...
6    A former fireman takes on a group of terrorist...
7    When a powerful secret defense system is stole...
8    A widowed U.S. President running for reelectio...
9    Mel Brooks' parody of the classic vampire stor...
Name: sinopse, dtype: object

## Limpeza e pré-processamento

### Dividindo o texto em sentenças e palavras

In [9]:
# Sentenças
sentences = data_features.apply(sent_tokenize)
sentences

0        [When two kids find and play a magical board g...
1        [John and Max resolve to save their beloved ba...
2        [Based on Terry McMillan's novel, this film fo...
3        [George Banks must deal not only with his daug...
4        [A group of high-end professional thieves star...
                               ...                        
30353    ["I'll look at you, but not at the camera., It...
30354    [A musical biography of the great Russian clas...
30355    [An evil genius uses poison gas to avenge hims...
30356    [A young man narrates his past on how his fath...
30357    [Female prisoners are shipped to Devil's Islan...
Name: sinopse, Length: 30358, dtype: object

In [10]:
# Palavras
tokens = data_features.apply(word_tokenize)
tokens

0        [When, two, kids, find, and, play, a, magical,...
1        [John, and, Max, resolve, to, save, their, bel...
2        [Based, on, Terry, McMillan, 's, novel, ,, thi...
3        [George, Banks, must, deal, not, only, with, h...
4        [A, group, of, high-end, professional, thieves...
                               ...                        
30353    [``, I, 'll, look, at, you, ,, but, not, at, t...
30354    [A, musical, biography, of, the, great, Russia...
30355    [An, evil, genius, uses, poison, gas, to, aven...
30356    [A, young, man, narrates, his, past, on, how, ...
30357    [Female, prisoners, are, shipped, to, Devil, '...
Name: sinopse, Length: 30358, dtype: object

### Converter o texto para letras minúsculas

In [11]:
for i in range(len(tokens)):
   tokens[i] = [x.lower() for x in tokens[i]]
tokens

0        [when, two, kids, find, and, play, a, magical,...
1        [john, and, max, resolve, to, save, their, bel...
2        [based, on, terry, mcmillan, 's, novel, ,, thi...
3        [george, banks, must, deal, not, only, with, h...
4        [a, group, of, high-end, professional, thieves...
                               ...                        
30353    [``, i, 'll, look, at, you, ,, but, not, at, t...
30354    [a, musical, biography, of, the, great, russia...
30355    [an, evil, genius, uses, poison, gas, to, aven...
30356    [a, young, man, narrates, his, past, on, how, ...
30357    [female, prisoners, are, shipped, to, devil, '...
Name: sinopse, Length: 30358, dtype: object

### Removendo símbolos de pontuação de cada Token

In [12]:
table = str.maketrans('', '', string.punctuation)

for i in range(len(tokens)):
   tokens[i] = [x.translate(table) for x in tokens[i]]
tokens

0        [when, two, kids, find, and, play, a, magical,...
1        [john, and, max, resolve, to, save, their, bel...
2        [based, on, terry, mcmillan, s, novel, , this,...
3        [george, banks, must, deal, not, only, with, h...
4        [a, group, of, highend, professional, thieves,...
                               ...                        
30353    [, i, ll, look, at, you, , but, not, at, the, ...
30354    [a, musical, biography, of, the, great, russia...
30355    [an, evil, genius, uses, poison, gas, to, aven...
30356    [a, young, man, narrates, his, past, on, how, ...
30357    [female, prisoners, are, shipped, to, devil, s...
Name: sinopse, Length: 30358, dtype: object

### "Convertendo" caracteres especiais

In [13]:
for i in range(len(tokens)):
  tokens[i] = [unidecode.unidecode(x) for x in tokens[i]]

tokens


0        [when, two, kids, find, and, play, a, magical,...
1        [john, and, max, resolve, to, save, their, bel...
2        [based, on, terry, mcmillan, s, novel, , this,...
3        [george, banks, must, deal, not, only, with, h...
4        [a, group, of, highend, professional, thieves,...
                               ...                        
30353    [, i, ll, look, at, you, , but, not, at, the, ...
30354    [a, musical, biography, of, the, great, russia...
30355    [an, evil, genius, uses, poison, gas, to, aven...
30356    [a, young, man, narrates, his, past, on, how, ...
30357    [female, prisoners, are, shipped, to, devil, s...
Name: sinopse, Length: 30358, dtype: object

### Removendo o que não são palavras

Talvez seja necessário usar alguns tokens númericos, mas tenho que olhar depois

In [14]:
words = list()
for i in range(len(tokens)):
  words.append([x for x in tokens[i] if x.isalpha()])
words

[['when',
  'two',
  'kids',
  'find',
  'and',
  'play',
  'a',
  'magical',
  'board',
  'game',
  'they',
  'release',
  'a',
  'man',
  'trapped',
  'in',
  'it',
  'for',
  'decades',
  'and',
  'a',
  'host',
  'of',
  'dangers',
  'that',
  'can',
  'only',
  'be',
  'stopped',
  'by',
  'finishing',
  'the',
  'game'],
 ['john',
  'and',
  'max',
  'resolve',
  'to',
  'save',
  'their',
  'beloved',
  'bait',
  'shop',
  'from',
  'turning',
  'into',
  'an',
  'italian',
  'restaurant',
  'just',
  'as',
  'its',
  'new',
  'female',
  'owner',
  'catches',
  'max',
  's',
  'attention'],
 ['based',
  'on',
  'terry',
  'mcmillan',
  's',
  'novel',
  'this',
  'film',
  'follows',
  'four',
  'very',
  'different',
  'africanamerican',
  'women',
  'and',
  'their',
  'relationships',
  'with',
  'men'],
 ['george',
  'banks',
  'must',
  'deal',
  'not',
  'only',
  'with',
  'his',
  'daughter',
  's',
  'pregnancy',
  'but',
  'also',
  'with',
  'his',
  'wife',
  's'],


### Removendo stop words

São palavras que são consideradas irrelevantes para os resultados de uma busca

In [15]:
stop_words = set(stopwords.words('english'))
stop_words #dicionário de stop words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [16]:
#lista words sem as stop words
for i in range(len(words)):
  words[i] = [word for word in words[i] if not word in stop_words]
words

[['two',
  'kids',
  'find',
  'play',
  'magical',
  'board',
  'game',
  'release',
  'man',
  'trapped',
  'decades',
  'host',
  'dangers',
  'stopped',
  'finishing',
  'game'],
 ['john',
  'max',
  'resolve',
  'save',
  'beloved',
  'bait',
  'shop',
  'turning',
  'italian',
  'restaurant',
  'new',
  'female',
  'owner',
  'catches',
  'max',
  'attention'],
 ['based',
  'terry',
  'mcmillan',
  'novel',
  'film',
  'follows',
  'four',
  'different',
  'africanamerican',
  'women',
  'relationships',
  'men'],
 ['george', 'banks', 'must', 'deal', 'daughter', 'pregnancy', 'also', 'wife'],
 ['group',
  'highend',
  'professional',
  'thieves',
  'start',
  'feel',
  'heat',
  'lapd',
  'unknowingly',
  'leave',
  'verbal',
  'clue',
  'latest',
  'heist'],
 ['ugly',
  'duckling',
  'undergone',
  'remarkable',
  'change',
  'still',
  'harbors',
  'feelings',
  'crush',
  'carefree',
  'playboy',
  'businessfocused',
  'brother',
  'something',
  'say'],
 ['former',
  'fireman'

### Stemming

 Stemming é uma técnica usada para reduzir uma palavra flexionada ao seu radical

In [17]:
porter = PorterStemmer()
stemmed = list()
for i in range(len(words)):
  stemmed.append([porter.stem(word) for word in words[i]])
res = list(zip(words, stemmed)) #comparando as listas para ver radical e palavra flexionada
res


[(['two',
   'kids',
   'find',
   'play',
   'magical',
   'board',
   'game',
   'release',
   'man',
   'trapped',
   'decades',
   'host',
   'dangers',
   'stopped',
   'finishing',
   'game'],
  ['two',
   'kid',
   'find',
   'play',
   'magic',
   'board',
   'game',
   'releas',
   'man',
   'trap',
   'decad',
   'host',
   'danger',
   'stop',
   'finish',
   'game']),
 (['john',
   'max',
   'resolve',
   'save',
   'beloved',
   'bait',
   'shop',
   'turning',
   'italian',
   'restaurant',
   'new',
   'female',
   'owner',
   'catches',
   'max',
   'attention'],
  ['john',
   'max',
   'resolv',
   'save',
   'belov',
   'bait',
   'shop',
   'turn',
   'italian',
   'restaur',
   'new',
   'femal',
   'owner',
   'catch',
   'max',
   'attent']),
 (['based',
   'terry',
   'mcmillan',
   'novel',
   'film',
   'follows',
   'four',
   'different',
   'africanamerican',
   'women',
   'relationships',
   'men'],
  ['base',
   'terri',
   'mcmillan',
   'novel',
   'fil