In [76]:
import string
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

In [77]:
movies_df = pd.read_csv (r'../raw_data/nico_movie_recommender.csv').copy()
movies_df.drop('Unnamed: 0', inplace=True, axis=1)

In [78]:
stop_words = set(stopwords.words('english')) 
lemmatizer = WordNetLemmatizer()

In [79]:
def cleaning(series):
    sentence = series.strip()
    sentence = sentence.lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '')
    
    sentence = word_tokenize(sentence)
    
    tokens_cleaned = [word for word in sentence if not word in stop_words]
    
    lemmatized_sentence = []
    
    for i in tokens_cleaned:
            lemmatized = lemmatizer.lemmatize(i)
            lemmatized_sentence.append(lemmatized)
            
#     " ".join(lemmatized_sentence)
    
    return lemmatized_sentence

In [80]:
movies_df[['cleaned_texts']] = movies_df[['text']].applymap(cleaning)

In [81]:
def vectorizer(series):
    tf_idf_vectorizer = TfidfVectorizer()
    vector_txts = tf_idf_vectorizer.fit_transform(series)
    return vector_txts.toarray()

In [82]:
movies_df['vectorized_texts'] = movies_df['cleaned_texts'].apply(vectorizer)

In [83]:
movies_df.keys()

Index(['title', 'genre', 'duration', 'text', 'cleaned_texts',
       'vectorized_texts'],
      dtype='object')

In [71]:
movies_df

Unnamed: 0.1,Unnamed: 0,title,genre,duration,text,cleaned_texts,vectorized_texts
0,0,The Shawshank Redemption,Drama,142,Two imprisoned men bond over a number of years...,"[two, imprisoned, men, bond, number, year, fin...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,1,The Godfather,"Crime, Drama",175,The aging patriarch of an organized crime dyna...,"[aging, patriarch, organized, crime, dynasty, ...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,2,The Dark Knight,"Action, Crime, Drama",152,When the menace known as the Joker wreaks havo...,"[menace, known, joker, wreaks, havoc, chaos, p...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,3,The Lord of the Rings: The Return of the King,"Action, Adventure, Drama",201,Gandalf and Aragorn lead the World of Men agai...,"[gandalf, aragorn, lead, world, men, saurons, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,..."
4,4,Schindler's List,"Biography, Drama, History",195,"In German-occupied Poland during World War II,...","[germanoccupied, poland, world, war, ii, indus...","[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
...,...,...,...,...,...,...,...
7145,7145,Monster a Go-Go,"Horror, Sci-F",68,"A space capsule crash-lands on Earth, and the ...","[space, capsule, crashlands, earth, astronaut,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
7146,7146,Pledge This!,Comedy,91,"At South Beach University, a beautiful sororit...","[south, beach, university, beautiful, sorority...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
7147,7147,Manos: The Hands of Fate,Horror,70,A family gets lost on the road and stumbles up...,"[family, get, lost, road, stumble, upon, hidde...","[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
7148,7148,Superbabies: Baby Geniuses 2,"Comedy, Family, Sci-F",88,A group of smart-talking toddlers find themsel...,"[group, smarttalking, toddler, find, center, m...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,..."


In [61]:
movies_df #-------------->> without toarray() maybe might be better for cosine not sure need to investigate

Unnamed: 0.1,Unnamed: 0,title,genre,duration,text,cleaned_texts,vectorized_texts
0,0,The Shawshank Redemption,Drama,142,Two imprisoned men bond over a number of years...,"[two, imprisoned, men, bond, number, year, fin...","(0, 11)\t1.0\n (1, 6)\t1.0\n (2, 7)\t1.0\n..."
1,1,The Godfather,"Crime, Drama",175,The aging patriarch of an organized crime dyna...,"[aging, patriarch, organized, crime, dynasty, ...","(0, 0)\t1.0\n (1, 9)\t1.0\n (2, 8)\t1.0\n ..."
2,2,The Dark Knight,"Action, Crime, Drama",152,When the menace known as the Joker wreaks havo...,"[menace, known, joker, wreaks, havoc, chaos, p...","(0, 11)\t1.0\n (1, 10)\t1.0\n (2, 9)\t1.0\..."
3,3,The Lord of the Rings: The Return of the King,"Action, Adventure, Drama",201,Gandalf and Aragorn lead the World of Men agai...,"[gandalf, aragorn, lead, world, men, saurons, ...","(0, 6)\t1.0\n (1, 1)\t1.0\n (2, 8)\t1.0\n ..."
4,4,Schindler's List,"Biography, Drama, History",195,"In German-occupied Poland during World War II,...","[germanoccupied, poland, world, war, ii, indus...","(0, 2)\t1.0\n (1, 10)\t1.0\n (2, 15)\t1.0\..."
...,...,...,...,...,...,...,...
7145,7145,Monster a Go-Go,"Horror, Sci-F",68,"A space capsule crash-lands on Earth, and the ...","[space, capsule, crashlands, earth, astronaut,...","(0, 12)\t1.0\n (1, 3)\t1.0\n (2, 5)\t1.0\n..."
7146,7146,Pledge This!,Comedy,91,"At South Beach University, a beautiful sororit...","[south, beach, university, beautiful, sorority...","(0, 10)\t1.0\n (1, 1)\t1.0\n (2, 13)\t1.0\..."
7147,7147,Manos: The Hands of Fate,Horror,70,A family gets lost on the road and stumbles up...,"[family, get, lost, road, stumble, upon, hidde...","(0, 2)\t1.0\n (1, 4)\t1.0\n (2, 7)\t1.0\n ..."
7148,7148,Superbabies: Baby Geniuses 2,"Comedy, Family, Sci-F",88,A group of smart-talking toddlers find themsel...,"[group, smarttalking, toddler, find, center, m...","(0, 7)\t1.0\n (1, 13)\t1.0\n (2, 16)\t1.0\..."


In [49]:
vectorized_texts[0].shape

(13, 13)

In [21]:
vectorizer = TfidfVectorizer()
vectorized_documents = vectorizer.fit_transform(cleaned_texts)
vectorized_documents = pd.DataFrame(vectorized_documents.toarray(), 
                                    columns = vectorizer.get_feature_names_out())
vectorized_documents

Unnamed: 0,aa,aang,aaron,aback,abandon,abandoned,abandoning,abandonment,abba,abbie,...,zoozie,zorba,zordon,zorel,zorro,zuckerberg,zuko,zurg,zé,æon
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
movies_df.to_csv('vectorized_texts.csv')