Text_Based_Search_Functionality

In [None]:
pip install pandas sentence-transformers scikit-learn fuzzywuzzy python-Levenshtein pyspellchecker nltk joblib

In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') below reads it.
nltk.download('stopwords')

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
from fuzzywuzzy import process
import numpy as np
import re
from spellchecker import SpellChecker
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


# Shared text-processing helpers used by the functions defined below.
spell = SpellChecker()  # consulted by correct_spelling()
stop_words = set(stopwords.words('english'))  # membership-tested by remove_stopwords()
stemmer = PorterStemmer()  # applied word by word in stem_words()


# Raw video catalogue; the code below reads its id, title, description,
# keyword and genre columns.
videos = pd.read_csv('videos.csv')

def clean_text(text):
    """Normalise one raw text field for indexing.

    The value is lower-cased, every character that is neither a word
    character nor whitespace is removed, and runs of whitespace are collapsed
    to a single space.

    Args:
        text: The raw cell value. Non-string values are converted with str().

    Returns:
        The cleaned string, or '' when the value is missing (None or NaN).
    """
    # Missing CSV cells arrive as None or float NaN. str() would turn them
    # into the literal word "nan", which would then be indexed and embedded
    # as if it were real text.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def remove_stopwords(text):
    """Return text with every token found in the module-level stop_words set removed.

    Tokens are split on whitespace and the survivors are re-joined with
    single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def stem_words(text):
    """Return text with each whitespace-separated token replaced by its stem.

    Stemming is delegated to the module-level stemmer; the stems are
    re-joined with single spaces.
    """
    stems = []
    for token in text.split():
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Normalise the free-text columns: clean, drop stop words, then stem.
# Missing cells are filled with '' first so they do not become the literal
# word "nan" inside clean_text's str() conversion.
for column in ('title', 'description', 'keyword'):
    videos[column] = (
        videos[column]
        .fillna('')
        .apply(clean_text)
        .apply(remove_stopwords)
        .apply(stem_words)
    )

# genre is used verbatim, but a missing genre must not wipe out the whole tag
# string (str + NaN evaluates to NaN in pandas).
videos['genre'] = videos['genre'].fillna('').astype(str)

videos['tags'] = videos['description'] + ' ' + videos['genre'] + ' ' + videos['keyword']

new_videos_data = videos[['id', 'title', 'tags']].drop_duplicates(subset='id')

# Drop rows with nothing to search or embed. Blank strings must be removed
# here because an empty cell written by to_csv is read back as NaN by read_csv.
has_title = new_videos_data['title'].str.strip() != ''
has_tags = new_videos_data['tags'].str.strip() != ''
new_videos_data = new_videos_data[has_title & has_tags].reset_index(drop=True)

# Embed the tag strings and persist the pairwise cosine-similarity matrix.
# Row i of the matrix corresponds to row i of videos_data.csv.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(new_videos_data['tags'].tolist(), show_progress_bar=True)

similarityv = cosine_similarity(embeddings)
joblib.dump(similarityv, 'similarity_matrix.pkl')
new_videos_data.to_csv('videos_data.csv', index=False)

def preprocess_input(keyword):
    """Normalise a query string the same way catalogue text is cleaned.

    Lower-cases the input, strips every character that is neither a word
    character nor whitespace, and collapses whitespace runs to one space.
    """
    lowered = keyword.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

def correct_spelling(keyword):
    """Replace words unknown to the spell checker with their most likely correction.

    Args:
        keyword: A whitespace-separated string of already-normalised words.

    Returns:
        The words re-joined with single spaces. Words the dictionary knows,
        and unknown words for which no correction exists, are kept unchanged.
    """
    corrected = []
    for word in keyword.split():
        if spell.unknown([word]):
            # correction() returns the single most probable candidate. The
            # candidates() set has no defined order, so taking its first
            # element gave an arbitrary result that could differ between runs.
            # Newer pyspellchecker releases return None when nothing fits.
            corrected.append(spell.correction(word) or word)
        else:
            corrected.append(word)
    return ' '.join(corrected)

def _keyword_mask(catalogue, keywords):
    """Return a boolean Series flagging rows whose title or tags contain any keyword."""
    mask = pd.Series(False, index=catalogue.index)
    for kw in keywords:
        # regex=False: keywords are literal text, never patterns.
        mask |= (catalogue['title'].str.contains(kw, case=False, na=False, regex=False)
                 | catalogue['tags'].str.contains(kw, case=False, na=False, regex=False))
    return mask


def _search_terms(keyword, catalogue):
    """Split a query into normalised terms, spell-correcting only terms the catalogue lacks."""
    terms = []
    for token in keyword.split():
        term = preprocess_input(token)
        if not term:
            # Punctuation-only token: an empty term would match every row.
            continue
        if not _keyword_mask(catalogue, [term]).any():
            # Terms already present in the catalogue (e.g. proper nouns the
            # dictionary does not know) must not be "corrected" away.
            term = correct_spelling(term)
        terms.append(term)
    return terms


def _keyword_score(title, tags, keywords):
    """Score one row: 1.5 per keyword found in the title, else 1.0 if found in the tags."""
    title = title.lower()
    tags = tags.lower()
    return sum(1.5 if kw in title else 1.0 if kw in tags else 0 for kw in keywords)


def recommend_videos(keyword, num_results=10):
    """Print the videos that best match a free-text query.

    The query is split into terms; every video whose title or tags contain at
    least one term is a match. Matches are ranked by keyword score (title hits
    weigh more than tag hits) and then by the sum of the video's row in the
    similarity matrix. If nothing matches, the closest title by fuzzy matching
    is suggested and searched instead. Finally, videos most similar to the
    displayed matches are listed as additional suggestions.

    Args:
        keyword: The free-text query.
        num_results: Maximum number of matches, and of additional
            suggestions, to print.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If similarity_matrix.pkl and videos_data.csv do not
            describe the same number of videos.
    """
    # NOTE(review): joblib.load unpickles the file; only load a matrix that
    # this notebook wrote itself.
    similarityv = joblib.load('similarity_matrix.pkl')
    new_videos_data = pd.read_csv('videos_data.csv')

    # Empty cells are read back as NaN; normalise them so .str and .lower()
    # always operate on strings.
    for column in ('title', 'tags'):
        new_videos_data[column] = new_videos_data[column].fillna('').astype(str)

    if len(similarityv) != len(new_videos_data):
        raise ValueError(
            "similarity_matrix.pkl and videos_data.csv are out of sync: "
            f"{len(similarityv)} matrix rows vs {len(new_videos_data)} videos."
        )

    keywords_corrected = _search_terms(keyword, new_videos_data)
    combined_filter = _keyword_mask(new_videos_data, keywords_corrected)

    if not combined_filter.any():
        titles = [title for title in new_videos_data['title'] if title.strip()]
        closest_match = process.extractOne(keyword, titles) if titles else None
        if closest_match is None or closest_match[1] < 10:
            print(f"No videos found containing the keyword '{keyword}'.")
            return
        print(f"No videos found containing the keyword '{keyword}'. Did you mean '{closest_match[0]}'?")
        # Search for the suggested title once, without spell correction,
        # instead of recursing: a corrected title might match nothing again
        # and recurse without end.
        keyword = closest_match[0]
        keywords_corrected = [term for term in map(preprocess_input, keyword.split()) if term]
        combined_filter = _keyword_mask(new_videos_data, keywords_corrected)
        if not combined_filter.any():
            print(f"No videos found containing the keyword '{keyword}'.")
            return

    print(f"Videos containing the keywords '{keyword}':")

    # .copy() so the score columns are added to an independent frame rather
    # than to a slice of new_videos_data.
    filtered_videos = new_videos_data[combined_filter].copy()
    filtered_videos['keyword_match_score'] = [
        _keyword_score(title, tags, keywords_corrected)
        for title, tags in zip(filtered_videos['title'], filtered_videos['tags'])
    ]

    # read_csv yields a 0..n-1 index, so row labels double as matrix positions.
    matched_positions = np.flatnonzero(combined_filter.to_numpy())
    filtered_videos['similarity_score'] = similarityv[matched_positions].sum(axis=1)

    filtered_videos = filtered_videos.sort_values(by=['keyword_match_score', 'similarity_score'], ascending=[False, False])

    displayed = filtered_videos.head(num_results)
    for _, row in displayed.iterrows():
        print(f"ID: {row['id']}, Title: {row['title']} (Keyword Score: {row['keyword_match_score']}, Similarity: {row['similarity_score']:.2f})")

    print("\nAdditional similar videos:")
    if displayed.empty:
        return

    # Rank the remaining videos by their mean similarity to the displayed
    # matches, and never list a displayed video twice.
    displayed_positions = displayed.index.to_numpy()
    relatedness = similarityv[displayed_positions].mean(axis=0)
    relatedness[displayed_positions] = -np.inf

    for position in np.argsort(-relatedness, kind='stable')[:num_results]:
        if relatedness[position] == -np.inf:
            break
        recommended_video = new_videos_data.iloc[position]
        print(f"ID: {recommended_video['id']}, Title: {recommended_video['title']} (Similarity: {relatedness[position]:.2f})")

# Smoke-test the search with a long, mixed-case, punctuated query.
recommend_videos("ADIPURUSH CELEBRATING VICTORY OF GOOD OVER EVIL Music Label: T-Series")


TESTING ACCURACY

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score
import joblib
from fuzzywuzzy import process
import numpy as np
import re
from spellchecker import SpellChecker

spell = SpellChecker()

videos = pd.read_csv('videos.csv')

# .copy() so the column assignments below act on an independent frame rather
# than on a slice of the frame returned by read_csv.
videos = videos[['id', 'title', 'description', 'keyword', 'genre']].copy()

# str + NaN evaluates to NaN in pandas, so one missing field would wipe out
# the whole tag string and pass a non-string value to the encoder.
for column in ('description', 'genre', 'keyword'):
    videos[column] = videos[column].fillna('').astype(str)
videos['tags'] = videos['description'] + ' ' + videos['genre'] + ' ' + videos['keyword']

new_videos_data = videos.drop(columns=['description', 'genre', 'keyword'])

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(new_videos_data['tags'].tolist(), show_progress_bar=True)

# NOTE: these are the same file names the search cell writes, so running this
# cell replaces that cell's cleaned index with one built from the raw text.
similarityv = cosine_similarity(embeddings)
joblib.dump(similarityv, 'similarity_matrix.pkl')
new_videos_data.to_csv('videos_data.csv', index=False)

def preprocess_input(keyword):
    """Lower-case a query, remove punctuation and collapse whitespace runs.

    Punctuation here means any character that is neither a word character
    nor whitespace.
    """
    text = keyword.lower()
    # Apply the substitutions in order: strip punctuation, then squeeze spaces.
    for pattern, replacement in ((r'[^\w\s]', ''), (r'\s+', ' ')):
        text = re.sub(pattern, replacement, text)
    return text.strip()

def correct_spelling(keyword):
    """Replace words unknown to the spell checker with their most likely correction.

    Args:
        keyword: A whitespace-separated string of already-normalised words.

    Returns:
        The words re-joined with single spaces. Words the dictionary knows,
        and unknown words for which no correction exists, are kept unchanged.
    """
    corrected = []
    for word in keyword.split():
        if spell.unknown([word]):
            # correction() returns the single most probable candidate. The
            # candidates() set has no defined order, so taking its first
            # element gave an arbitrary result that could differ between runs.
            # Newer pyspellchecker releases return None when nothing fits.
            corrected.append(spell.correction(word) or word)
        else:
            corrected.append(word)
    return ' '.join(corrected)

def _keyword_mask(catalogue, keywords):
    """Return a boolean Series flagging rows whose title or tags contain any keyword."""
    mask = pd.Series(False, index=catalogue.index)
    for kw in keywords:
        # regex=False: keywords are literal text, never patterns.
        mask |= (catalogue['title'].str.contains(kw, case=False, na=False, regex=False)
                 | catalogue['tags'].str.contains(kw, case=False, na=False, regex=False))
    return mask


def _search_terms(keyword, catalogue):
    """Split a query into normalised terms, spell-correcting only terms the catalogue lacks."""
    terms = []
    for token in keyword.split():
        term = preprocess_input(token)
        if not term:
            # Punctuation-only token: an empty term would match every row.
            continue
        if not _keyword_mask(catalogue, [term]).any():
            # Terms already present in the catalogue (e.g. proper nouns the
            # dictionary does not know) must not be "corrected" away.
            term = correct_spelling(term)
        terms.append(term)
    return terms


def _keyword_score(title, tags, keywords):
    """Score one row: 1.5 per keyword found in the title, else 1.0 if found in the tags."""
    title = title.lower()
    tags = tags.lower()
    return sum(1.5 if kw in title else 1.0 if kw in tags else 0 for kw in keywords)


def _evaluate_recommendations(recommended_ids, ground_truth_ids):
    """Return (precision, recall, f1) of recommended_ids against ground_truth_ids."""
    relevant = set(ground_truth_ids)
    relevant_shown = sum(1 for video_id in recommended_ids if video_id in relevant)
    relevant_found = len(relevant.intersection(recommended_ids))

    precision = relevant_shown / len(recommended_ids) if recommended_ids else 0.0
    # Recall is measured against the whole ground truth. Scoring only the
    # recommended items can never register a missed relevant video.
    recall = relevant_found / len(relevant) if relevant else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1


def recommend_videos(keyword, num_results, ground_truth_ids):
    """Print the best matches for a query together with retrieval metrics.

    The query is split into terms; every video whose title or tags contain at
    least one term is a match. Matches are ranked by keyword score (title hits
    weigh more than tag hits) and then by the sum of the video's row in the
    similarity matrix. The top num_results matches are evaluated against
    ground_truth_ids and printed. If nothing matches, the closest title by
    fuzzy matching is suggested and searched instead. Finally, videos most
    similar to the displayed matches are listed as additional suggestions.

    Args:
        keyword: The free-text query.
        num_results: Maximum number of matches, and of additional
            suggestions, to print and evaluate.
        ground_truth_ids: IDs of the videos considered relevant for the query.

    Returns:
        None. All results and metrics are printed.

    Raises:
        ValueError: If similarity_matrix.pkl and videos_data.csv do not
            describe the same number of videos.
    """
    # NOTE(review): joblib.load unpickles the file; only load a matrix that
    # this notebook wrote itself.
    similarityv = joblib.load('similarity_matrix.pkl')
    new_videos_data = pd.read_csv('videos_data.csv')

    # Empty cells are read back as NaN; normalise them so .str and .lower()
    # always operate on strings.
    for column in ('title', 'tags'):
        new_videos_data[column] = new_videos_data[column].fillna('').astype(str)

    if len(similarityv) != len(new_videos_data):
        raise ValueError(
            "similarity_matrix.pkl and videos_data.csv are out of sync: "
            f"{len(similarityv)} matrix rows vs {len(new_videos_data)} videos."
        )

    keywords_corrected = _search_terms(keyword, new_videos_data)
    combined_filter = _keyword_mask(new_videos_data, keywords_corrected)

    if not combined_filter.any():
        titles = [title for title in new_videos_data['title'] if title.strip()]
        closest_match = process.extractOne(keyword, titles) if titles else None
        if closest_match is None or closest_match[1] < 10:
            print(f"No videos found containing the keyword '{keyword}'.")
            return
        print(f"No videos found containing the keyword '{keyword}'. Did you mean '{closest_match[0]}'?")
        # Search for the suggested title once, without spell correction,
        # instead of recursing: a corrected title might match nothing again
        # and recurse without end.
        keyword = closest_match[0]
        keywords_corrected = [term for term in map(preprocess_input, keyword.split()) if term]
        combined_filter = _keyword_mask(new_videos_data, keywords_corrected)
        if not combined_filter.any():
            print(f"No videos found containing the keyword '{keyword}'.")
            return

    print(f"Videos containing the keywords '{keyword}':")

    # .copy() so the score columns are added to an independent frame rather
    # than to a slice of new_videos_data.
    filtered_videos = new_videos_data[combined_filter].copy()
    filtered_videos['keyword_match_score'] = [
        _keyword_score(title, tags, keywords_corrected)
        for title, tags in zip(filtered_videos['title'], filtered_videos['tags'])
    ]

    # read_csv yields a 0..n-1 index, so row labels double as matrix positions.
    matched_positions = np.flatnonzero(combined_filter.to_numpy())
    filtered_videos['similarity_score'] = similarityv[matched_positions].sum(axis=1)

    filtered_videos = filtered_videos.sort_values(by=['keyword_match_score', 'similarity_score'], ascending=[False, False])

    filtered_videos = filtered_videos.head(num_results)

    precision, recall, f1 = _evaluate_recommendations(filtered_videos['id'].tolist(), ground_truth_ids)
    print(f"\nEvaluation Metrics:\nPrecision: {precision:.2f}\nRecall: {recall:.2f}\nF1 Score: {f1:.2f}")

    for _, row in filtered_videos.iterrows():
        print(f"ID: {row['id']}, Title: {row['title']} (Keyword Score: {row['keyword_match_score']}, Similarity: {row['similarity_score']:.2f})")

    print("\nAdditional similar videos:")
    if filtered_videos.empty:
        return

    # Rank the remaining videos by their mean similarity to the displayed
    # matches, and never list a displayed video twice.
    displayed_positions = filtered_videos.index.to_numpy()
    relatedness = similarityv[displayed_positions].mean(axis=0)
    relatedness[displayed_positions] = -np.inf

    for position in np.argsort(-relatedness, kind='stable')[:num_results]:
        if relatedness[position] == -np.inf:
            break
        recommended_video = new_videos_data.iloc[position]
        print(f"ID: {recommended_video['id']}, Title: {recommended_video['title']} (Similarity: {relatedness[position]:.2f})")

# Video IDs treated as the relevant results for the query below; they are
# passed to recommend_videos, which scores its recommendations against them.
ground_truth_ids = [934772, 934773, 934774, 934775, 934776, 934777, 934778, 934779, 934780, 934781]
recommend_videos("Telugu songs", num_results=10, ground_truth_ids=ground_truth_ids)