In [None]:
# Mount Google Drive in this Colab runtime; every CSV and model path used
# below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Importing libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# NLTK resources needed by the cleaning functions defined later:
#   punkt     -> word_tokenize
#   stopwords -> stopwords.words('english')
#   wordnet   -> WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import gensim
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the subtitle sample CSV from the mounted Drive into a DataFrame.
data=pd.read_csv("/content/drive/MyDrive/sample_subtitle_data.csv")

In [None]:
# Preview the first rows to check which columns the raw CSV contains.
data.head()

Unnamed: 0.1,Unnamed: 0,num,name,content,file_content
0,19167,9258256,recipes.for.love.and.murder.s01.e06.breakfast....,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x00\\...,"ï»¿1\r\n00:00:07,360 --> 00:00:10,920\r\n [Mar..."
1,46043,9372418,gossip.girl.s02.e03.the.dark.night.(2008).eng.1cd,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x00\\...,"ï»¿1\r\n00:00:01,301 --> 00:00:03,582\r\n<i>Go..."
2,8587,9217311,top.gun.maverick.(2022).eng.1cd,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x00\\...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nUse t..."
3,39143,9341767,army.wives.s04.e09.new.orders.(2010).eng.1cd,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x00\\...,"ï»¿1\r\n00:00:02,185 --> 00:00:03,296\r\nROXY:..."
4,58010,9423884,the.exchange.s01.e01.bank.of.tomorrow.(2023).e...,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x00\\...,"1\r\n00:00:07,480 --> 00:00:08,480\r\nFire!\r\..."


In [None]:
# Keep only the title and the decoded subtitle text.
# "Unnamed: 0.1" is a leftover saved-index column visible in the preview of
# this CSV, so it is dropped together with the other bookkeeping columns.
# errors="ignore" lets the cell be re-run (or run on an export without some of
# these columns) without raising KeyError.
data = data.drop(
    columns=["Unnamed: 0.1", "Unnamed: 0", "num", "content"],
    errors="ignore",
)

In [None]:
# Give the two remaining columns readable names used by the rest of the notebook.
data = data.rename(
    columns={
        "name": "Series/Movie",
        "file_content": "Subtitles",
    }
)

In [None]:
# Confirm the column names after the drop/rename steps.
data.columns

Index(['Series/Movie', 'Subtitles'], dtype='object')

In [None]:
# Check row count, dtypes and non-null counts before cleaning the text.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32999 entries, 0 to 32998
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Series/Movie  32999 non-null  object
 1   Subtitles     32999 non-null  object
dtypes: object(2)
memory usage: 515.7+ KB


In [None]:
#Text Preprocessing
def preprocess_text(text):
    """Lower-case ``text`` and blank out everything that is not a plain ASCII letter run.

    Each pattern below is applied in order and every match is replaced by a
    single space, so word boundaries are preserved (whitespace is not
    collapsed).

    Args:
        text: Raw subtitle text as a string.

    Returns:
        The lower-cased text with URLs, punctuation, special characters and
        digits replaced by spaces.
    """
    cleanup_patterns = (
        r'www\.\S+',        # websites starting with www
        r'http\S+',         # websites starting with http / https
        r'[^\w\s]',         # punctuation
        r'[^A-Za-z0-9\s]',  # remaining special characters (e.g. "_", non-ASCII)
        r'\d+',             # numbers
    )
    cleaned = text.lower()
    for pattern in cleanup_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned

In [None]:
# Run the regex clean-up over every subtitle document (element-wise).
data["Subtitles"] = data["Subtitles"].map(preprocess_text)

In [None]:
# Checkpoint the regex-cleaned text to Drive.
# NOTE(review): to_csv defaults to index=True, so the row index is written too
# and comes back as an extra "Unnamed: 0" column if this file is reloaded with
# pd.read_csv.
data.to_csv("/content/drive/MyDrive/subtitle_data.csv")

In [None]:
# The stop-word set and the lemmatizer are identical for every document, so
# they are created once on first use instead of on every lemma_text() call.
_LEMMA_RESOURCES = {}


def _lemma_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _LEMMA_RESOURCES:
        _LEMMA_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _LEMMA_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _LEMMA_RESOURCES["stop_words"], _LEMMA_RESOURCES["lemmatizer"]


def lemma_text(text):
    """Remove English stop words from ``text`` and lemmatize what is left.

    Args:
        text: Cleaned subtitle text as a string.

    Returns:
        A single string of the surviving tokens, lemmatized with
        ``WordNetLemmatizer`` (default part of speech) and joined by spaces.
    """
    # Tokenize the text
    tokens = word_tokenize(text)
    stop_words, lemmatizer = _lemma_resources()
    # Stop-word check is case-insensitive; the token itself is kept as-is.
    kept_tokens = (word for word in tokens if word.lower() not in stop_words)
    return ' '.join(lemmatizer.lemmatize(word) for word in kept_tokens)

In [None]:
# Stop-word removal + lemmatization for every subtitle document (element-wise).
data["Subtitles"] = data["Subtitles"].map(lemma_text)

In [None]:
# Re-check row count and non-null counts after lemmatization.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32999 entries, 0 to 32998
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Sereis/Movie  32999 non-null  object
 1   Subtitles     32999 non-null  object
dtypes: object(2)
memory usage: 515.7+ KB


In [None]:
def remove_short_words(text, min_length=4):
    """Drop every whitespace-separated word shorter than ``min_length`` characters.

    Args:
        text: Input string; it is split on any run of whitespace.
        min_length: Smallest word length to keep. The default of 4 keeps only
            words with more than three characters.

    Returns:
        The kept words joined by single spaces (empty string if none survive).
    """
    return ' '.join(word for word in text.split() if len(word) >= min_length)

In [None]:
# Filter out the short leftover fragments in every document; the function is
# passed directly since it already takes a single string argument.
data["Subtitles"] = data["Subtitles"].astype(str).apply(remove_short_words)

In [None]:
# Strip the "eng.1cd" release tag from the titles. A literal replacement is
# used on purpose: as a regex pattern the unescaped "." would match ANY
# character (e.g. "engX1cd"), not just a dot.
data["Series/Movie"] = data["Series/Movie"].apply(lambda title: title.replace("eng.1cd", ""))

In [None]:
# Checkpoint the fully cleaned data. index=False keeps the row index out of
# the file, so reading it back with pd.read_csv does not add a stray
# "Unnamed: 0" column.
data.to_csv("/content/drive/MyDrive/cleaned_subtitle_data1.csv", index=False)

In [None]:
# Reload the cleaned checkpoint written above. Note that empty subtitle
# strings are read back as NaN by pd.read_csv.
data=pd.read_csv("/content/drive/MyDrive/cleaned_subtitle_data1.csv")

In [None]:
# Inspect the reloaded frame: columns, dtypes and non-null counts.
data.info()

In [None]:
# Preview the first rows of the reloaded, cleaned data.
data.head()

In [None]:
# Function to tokenize a document
def tokenize_text(text):
    """Split ``text`` into a list of tokens with NLTK's ``word_tokenize``.

    Args:
        text: Document text as a string.

    Returns:
        A new list containing the tokens in their original order.
    """
    return list(word_tokenize(text))

In [None]:
# Build the Word2Vec training corpus: one token list per document.
# Missing subtitles (NaN after the CSV reload) are replaced with "" first;
# otherwise astype(str) would turn them into the literal token "nan".
preprocessed_data = [
    tokenize_text(text)
    for text in data["Subtitles"].fillna("").astype(str)
]

In [None]:
# Train word embeddings on the tokenized subtitles.
word2vec_model = Word2Vec(
    sentences=preprocessed_data,
    vector_size=300,  # dimensionality of each word vector
    min_count=5,      # ignore words seen fewer than 5 times in the corpus
)

In [None]:
# Save the trained Word2Vec model. The path uses "MyDrive" like every other
# path in this notebook, so the load cell below reads back exactly this file.
word2vec_model.save("/content/drive/MyDrive/word2vec_model.bin")

In [None]:
import joblib
import pickle

In [None]:
# The file was written with Word2Vec.save(), so load it with the matching
# Word2Vec.load(). The helpers below read the vectors through `model.wv`,
# which is an attribute of the full Word2Vec model.
model = Word2Vec.load("/content/drive/MyDrive/word2vec_model.bin")

In [None]:
# Write a second copy of the loaded model to Drive in joblib format.
# joblib.dump returns the list of file names it wrote, which is what the cell
# displays.
model_file_path = "/content/drive/MyDrive/word2vec_model.joblib"
joblib.dump(model, model_file_path)

['/content/drive/MyDrive/word2vec_model.joblib']

In [None]:
# Define document vector
def document_vector(tokens, word2vec_model):
    """Average the word vectors of the in-vocabulary tokens of one document.

    Args:
        tokens: Iterable of token strings. A plain string is split on
            whitespace first, because iterating a string directly would yield
            single characters instead of words.
        word2vec_model: Object exposing ``vector_size`` and word vectors,
            either under ``.wv`` (a full Word2Vec model) or directly
            (a KeyedVectors-style object supporting ``in`` and ``[]``).

    Returns:
        A 1-D float array of length ``vector_size``: the mean of the vectors
        of all tokens found in the vocabulary, or all zeros if none is found.
    """
    if isinstance(tokens, str):
        tokens = tokens.split()

    # Accept both a full model (vectors under .wv) and bare keyed vectors.
    keyed_vectors = getattr(word2vec_model, "wv", word2vec_model)

    vector_sum = np.zeros(word2vec_model.vector_size)  # running sum of word vectors
    known_tokens = 0  # number of tokens that had a vector
    for token in tokens:
        if token in keyed_vectors:
            vector_sum += keyed_vectors[token]
            known_tokens += 1
    if known_tokens > 0:
        vector_sum /= known_tokens  # turn the sum into an average
    return vector_sum



In [None]:
# Missing subtitles become empty strings so that .split() below is always valid
data['Subtitles'] = data['Subtitles'].fillna('')

# One averaged embedding per row, computed from the whitespace-split subtitle text
data['Document_Vector'] = data['Subtitles'].apply(
    lambda subtitle_text: document_vector(subtitle_text.split(), model)
)

In [None]:
# Save the frame including the 'Document_Vector' column to Drive.
# NOTE(review): CSV stores each vector as its printed text, so the column has
# to be parsed back into arrays after a reload; the default index=True also
# writes the row index as an extra unnamed column.
data.to_csv("/content/drive/MyDrive/final_data1.csv")

In [None]:
# Calculating cosine similarity
def calculate_similarity(query_vector, document_vectors):
    """Return the cosine similarity between one query vector and every document vector.

    Args:
        query_vector: 1-D vector for the query.
        document_vectors: 2-D collection with one document vector per row.

    Returns:
        A flat 1-D array with one similarity score per document row.
    """
    # cosine_similarity works on 2-D inputs, so wrap the query as a single row.
    query_matrix = [query_vector]
    similarity_matrix = cosine_similarity(query_matrix, document_vectors)
    return similarity_matrix.flatten()

In [None]:
# Define a function to find similar documents
def find_similar_documents(query, data, model):
    """Rank all rows of ``data`` by cosine similarity to ``query``.

    Args:
        query: Search text as a string, or an already tokenized list of words.
        data: DataFrame with a 'Document_Vector' column holding one vector per row.
        model: Embedding model passed through to ``document_vector``.

    Returns:
        The rows of ``data`` reordered from most to least similar.
    """
    if isinstance(query, str):
        # A raw string must be turned into word tokens first; passing it
        # straight to document_vector would average over single characters.
        # The query goes through the same cleaning steps as the subtitles so
        # its tokens match the vocabulary the model was trained on.
        cleaned_query = remove_short_words(lemma_text(preprocess_text(query)))
        query = cleaned_query.split()

    query_vector = document_vector(query, model)
    document_vectors = np.array(data['Document_Vector'].tolist())
    similarities = calculate_similarity(query_vector, document_vectors)
    # argsort is ascending, so reverse it to put the best match first
    sorted_indices = np.argsort(similarities)[::-1]
    similar_documents = data.iloc[sorted_indices]
    return similar_documents

In [None]:
# Example search: rank all documents against a free-text query and print the
# subtitle text of the ten highest-ranked rows.
query = "send me the required documents"
similar_documents = find_similar_documents(query, data, model)
print(similar_documents["Subtitles"].head(10))

13124    script info title english u original script hi...
12807    script info title english u scripttype v wraps...
18815    script info title english u scripttype v wraps...
948      script info title english u scripttype v wraps...
10696    script info title english u scripttype v wraps...
7505     script info script generated aegisub daydream ...
16242    script info script generated aegisub daydream ...
29847    script info script generated aegisub daydream ...
11855    script info script generated aegisub title sub...
25947    advertise product brand contact today tamam sa...
Name: Subtitles, dtype: object
