In [6]:
import pandas as pd
import numpy as np
import scipy as sc
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK 'stopwords' corpus so that stopwords.words('english') can be read later in the notebook.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Esteban\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [7]:
# Cache of stop-word sets keyed by language, so the NLTK corpus is read once
# rather than on every call (the function is applied to every row of the dataset).
_STOP_WORDS_CACHE = {}


def _load_stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, loading it at most once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


# Text preprocessing function
def preprocess_text(text, stop_words=None):
    """Normalize a piece of text for vectorization.

    Steps, in order: strip every character that is neither a word character
    nor whitespace, lowercase the result, and drop stop words.

    Parameters
    ----------
    text : str or None or float
        The raw text. ``None`` and float NaN (missing values) yield an empty
        string; any other non-string value is converted with ``str()``.
    stop_words : collection of str, optional
        Words to remove. When omitted, the NLTK English stop-word list is
        used (loaded once and cached).

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Missing values cannot go through re.sub (TypeError); treat them as empty text.
    # `text != text` is the NaN check that needs no extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stop words
    if stop_words is None:
        stop_words = _load_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)


In [11]:
# Load the dataset
dataset_path = '../Dataset_MVP/data_depurado.csv'
df_ml = pd.read_csv(dataset_path)
# Apply the preprocessing to the title, based on the "word cloud"
titles_clean = df_ml['title'].apply(preprocess_text)
df_ml['title_processed'] = titles_clean
print(df_ml['title_processed'])

0                   toy story
1                     jumanji
2            grumpier old men
3              waiting exhale
4        father bride part ii
                 ...         
45447              robin hood
45448        century birthing
45449                betrayal
45450        satan triumphant
45451                queerama
Name: title_processed, Length: 45452, dtype: object


In [13]:
# Vectorization: fit a TF-IDF vectorizer on the preprocessed titles
# and keep both the fitted vectorizer and the resulting matrix.
processed_titles = df_ml['title_processed']
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(processed_titles)
print(tfidf_matrix)

  (0, 20052)	0.5339829726459521
  (0, 21364)	0.8454952305744796
  (1, 11072)	1.0
  (2, 13472)	0.44216862921234484
  (2, 15049)	0.4958159895054748
  (2, 9018)	0.7474312061261443
  (3, 7148)	0.7982678793737356
  (3, 22598)	0.6023025757542126
  (4, 10212)	0.5041106991164298
  (4, 3055)	0.6113284332071218
  (4, 7398)	0.6100409410759824
  (5, 9508)	1.0
  (6, 18026)	1.0
  (7, 10020)	0.8034375654663235
  (7, 21230)	0.5953890143406637
  (8, 5518)	0.5386812876228796
  (8, 20231)	0.8425096262743569
  (9, 8741)	1.0
  (10, 16498)	0.8158063391416447
  (10, 964)	0.5783251827616602
  (11, 12568)	0.6635345109196596
  (11, 5493)	0.4453138767029794
  (11, 6301)	0.6011800928460387
  (12, 1871)	1.0
  (13, 14721)	1.0
  :	:
  (45438, 13227)	0.47020458395233206
  (45439, 5587)	0.6895294168311971
  (45439, 9503)	0.7242576774356135
  (45440, 13904)	1.0
  (45441, 14058)	1.0
  (45442, 17802)	0.6755735113500229
  (45442, 13587)	0.5250754096891398
  (45442, 19816)	0.5175869442924373
  (45443, 9938)	0.8373810853074

In [17]:
# Save the DataFrame and the fitted vectorizer / TF-IDF matrix
csv_destination = '../Dataset_MVP/dataframe_ml.csv'
vectorizer_file = 'tfidf_vectorizer.joblib'
matrix_file = 'tfidf_matrix.joblib'

df_ml.to_csv(csv_destination, index=False)
print(df_ml)
joblib.dump(tfidf, vectorizer_file)
# Left as the cell's final bare expression so the list of written files is displayed.
joblib.dump(tfidf_matrix, matrix_file)


       belongs_to_collection      budget                      genres      id  \
0                    10194.0  30000000.0   Animation, Comedy, Family     862   
1                        NaN  65000000.0  Adventure, Fantasy, Family    8844   
2                   119050.0         0.0             Romance, Comedy   15602   
3                        NaN  16000000.0      Comedy, Drama, Romance   31357   
4                    96871.0         0.0                      Comedy   11862   
...                      ...         ...                         ...     ...   
45447                    NaN         0.0      Drama, Action, Romance   30840   
45448                    NaN         0.0                       Drama  111109   
45449                    NaN         0.0     Action, Drama, Thriller   67758   
45450                    NaN         0.0                         NaN  227506   
45451                    NaN         0.0                         NaN  461257   

      original_language                

['tfidf_matrix.joblib']