# Friends TV Show Script **Word2Vec**

In [1]:
# Import required libraries

import gensim
import os
import nltk
import matplotlib.pyplot as plt

## Load Dataset

In [2]:
# Load Data - Friends TV show Script

import kagglehub

# `path` is the local directory that holds the downloaded dataset files.
path = kagglehub.dataset_download("divyansh22/friends-tv-show-script")

# Read the whole transcript into memory. The context manager closes the file
# handle deterministically, and the explicit encoding keeps the decoded text
# independent of the platform's default locale.
transcript_path = os.path.join(path, "Friends_Transcript.txt")
with open(transcript_path, "r", encoding="utf-8") as transcript_file:
    script = transcript_file.read()

In [3]:
# Preview the first 500 characters of the transcript to confirm it loaded.
script[:500]

"THE ONE WHERE MONICA GETS A NEW ROOMATE (THE PILOT-THE UNCUT VERSION)\nWritten by: Marta Kauffman & David Crane\n[Scene: Central Perk, Chandler, Joey, Phoebe, and Monica are there.]\nMonica: There's nothing to tell! He's just some guy I work with!\nJoey: C'mon, you're going out with the guy! There's gotta be something wrong with him!\nChandler: All right Joey, be nice. So does he have a hump? A hump and a hairpiece?\nPhoebe: Wait, does he eat chalk?\n(They all stare, bemused.)\nPhoebe: Just, 'cause, I d"

## Data Preprocessing

In [4]:
# Fetch the NLTK resources used by the preprocessing step.
# Stop-word lists, read later through stopwords.words('english').
nltk.download('stopwords')
# Tokenizer data; presumably what sent_tokenize / word_tokenize load - confirm against the NLTK docs.
nltk.download('punkt_tab')
# WordNet data; presumably what WordNetLemmatizer looks words up in - confirm against the NLTK docs.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Data Preprocessing

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    """Turn raw text into cleaned, lemmatized tokens, grouped by sentence.

    The text is split into sentences with ``sent_tokenize``. Each sentence is
    lower-cased and split into tokens with ``word_tokenize``; English stop
    words and tokens that are not purely alphabetic are dropped, and the
    remaining tokens are lemmatized with ``WordNetLemmatizer``.

    Args:
        text: Raw text to process.

    Returns:
        A list with one entry per sentence, each entry being a list of token
        strings. A sentence whose tokens are all filtered out yields an
        empty list.
    """
    raw_sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def clean_sentence(sentence):
        # The sentence is lower-cased before tokenizing, so every token is
        # already lowercase when it is filtered and lemmatized.
        kept = []
        for token in word_tokenize(sentence.lower()):
            if token in stop_words or not token.isalpha():
                continue
            kept.append(lemmatizer.lemmatize(token))
        return kept

    return [clean_sentence(sentence) for sentence in raw_sentences]

# Preprocess the whole transcript; `words` holds one token list per sentence.
words = preprocess_text(script)

In [6]:
# Inspect two of the processed sentences (indices 2 and 3).
words[2:4]

[['guy', 'work'], ['joey', 'going', 'guy']]

## Building the Word2Vec model

In [7]:
# Define model (hyper-parameters only).
# The corpus is deliberately not passed here: giving sentences to the
# constructor makes gensim build the vocabulary and train immediately, and the
# explicit build_vocab() / train() cells that follow would then repeat that work.
model = gensim.models.Word2Vec(
    vector_size=100,  # Dimension of each word vector
    window=2,  # Maximum distance between the current word and a context word
    min_count=5,  # Ignore words that occur fewer than 5 times in the corpus
)


In [8]:
# Build the vocabulary from the tokenized sentences
model.build_vocab(words)



In [9]:
# Check the number of epochs configured on the model
# (model.corpus_count can be inspected the same way).
model.epochs

5

In [10]:
# Training of model
# total_examples: the sentence count stored on the model (model.corpus_count).
# epochs: reuse the model's configured value instead of a second hard-coded 5,
# so the training run cannot drift from the model's own setting.
model.train(words, total_examples=model.corpus_count, epochs=model.epochs)



(1792902, 2418945)

## Find most similar words

In [11]:
# Words whose vectors are most similar to "monica", as (word, score) pairs.
model.wv.most_similar('monica')

[('joey', 0.7726795673370361),
 ('phoebe', 0.7702150940895081),
 ('rachel', 0.7562381625175476),
 ('janice', 0.7543542981147766),
 ('ross', 0.7431071400642395),
 ('mona', 0.7319304943084717),
 ('paul', 0.7306249141693115),
 ('kathy', 0.7302491664886475),
 ('charlie', 0.7053934335708618),
 ('mike', 0.6950560212135315)]

In [12]:
# Same query for a non-character word, for comparison with the result above.
model.wv.most_similar('cat')

[('pot', 0.8759946823120117),
 ('ice', 0.8674176931381226),
 ('smelly', 0.8608942627906799),
 ('holy', 0.8451147079467773),
 ('sharing', 0.844830334186554),
 ('jungle', 0.8442457318305969),
 ('misunderstanding', 0.8346870541572571),
 ('complain', 0.83254474401474),
 ('pour', 0.8288383483886719),
 ('broom', 0.8274937868118286)]

## Find the word that doesn't match

In [13]:
# Ask the model which word in the list fits least well with the others.
model.wv.doesnt_match(['monica', 'chandler', 'joey', 'cat'])

'cat'

In [14]:
# Same check with a list of three names and one common noun.
model.wv.doesnt_match(['monica', 'movie', 'gary', 'ross'])

'movie'

## Find Similarity score

In [15]:
# Similarity score between the vectors of two words.
model.wv.similarity("monica","ross")

0.7431071

In [16]:
# Similarity score for another pair of character names.
model.wv.similarity("monica","joey")

0.77267957

In [17]:
# Similarity score for a pair of common nouns, for contrast with the name pairs.
model.wv.similarity("coffee","cat")

0.12260482

In [18]:
# Get the learned vector for a single word (100 values, matching vector_size)
model.wv.get_vector("rachel")

array([-0.825656  ,  1.5502739 , -0.76927394, -0.12953071, -0.11902016,
       -1.0517694 , -0.97604966,  0.9054335 , -0.28895146,  0.17651233,
        0.04248256, -0.20980154, -0.30248788, -0.45991918,  0.5002718 ,
       -0.19180182,  0.06584375, -0.06865277, -0.12753902, -0.4421682 ,
        0.30136   ,  0.19693486,  0.00475379,  0.06420079,  0.36398113,
        0.07632197, -0.51185375,  0.18248558, -0.29584152,  0.20999874,
       -0.05706926,  0.06218879, -0.04943893, -0.00947373,  0.29875487,
        0.461261  , -0.30293012,  0.88229024, -0.08200062, -0.48991215,
       -0.01435658, -0.37155044, -0.25530323,  0.13453002,  0.08157101,
       -1.0865589 ,  0.31390375, -0.38369265,  0.25985447, -0.14108673,
       -0.3489474 , -0.05120184,  0.7882753 , -0.03596024,  0.08146546,
        0.33362263,  0.25838217,  0.2592136 , -0.4490116 ,  0.9189983 ,
        0.5923373 , -0.14294447, -0.54565066, -0.08494864, -0.2389199 ,
        0.48114204,  0.465541  ,  0.17845674, -0.11038305,  0.13

## Visualizing vectors in 3D space

In [19]:
# Normalized vectors for every word in the vocabulary, one row per word.
# shape = (5652, 100): 5652 words x 100 dimensions (vector_size)
model.wv.get_normed_vectors()

array([[-0.08327463,  0.32022372, -0.13381226, ..., -0.22148588,
        -0.18794797,  0.02537314],
       [-0.17761612,  0.3334968 , -0.16548714, ..., -0.15767682,
        -0.07135279,  0.04985169],
       [-0.1952275 ,  0.32558906, -0.06452323, ..., -0.23461223,
        -0.14334719,  0.01332051],
       ...,
       [-0.02610012,  0.16980524,  0.1598669 , ..., -0.08739871,
         0.00347443, -0.00784669],
       [-0.00431081,  0.16099946,  0.19351323, ..., -0.2021135 ,
         0.01298882, -0.04634085],
       [-0.01074887,  0.13033347,  0.17861912, ..., -0.15425591,
        -0.00231369, -0.0014952 ]], dtype=float32)

In [20]:
# Words in index order; each position maps to the matching row of the vector matrix above.
# total length = 5652
model.wv.index_to_key[:5]

['ross', 'rachel', 'chandler', 'joey', 'monica']

In [21]:
# Use PCA to reduce the word vectors from 100 to 3 dimensions

from sklearn.decomposition import PCA

# Fit on the normalized vectors of the whole vocabulary and project every word
# onto 3 components.
pca = PCA(n_components=3)
vectors_3d = pca.fit_transform(model.wv.get_normed_vectors())

In [22]:
import plotly.express as px

# Plot only the first 50 words in index_to_key order. Columns 0, 1 and 2 of
# vectors_3d (the three PCA components) are the x, y and z axes, and each
# point is coloured by its word.
fig = px.scatter_3d(vectors_3d[:50],color=model.wv.index_to_key[:50], x = 0, y = 1, z = 2,title="Word2Vec in 3d")
fig.show()