In [2]:
import json
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import contractions

# English stop words used by the text preprocessing step (NLTK's stopwords corpus).
stop_words = stopwords.words('english')

## DATA INGESTION

In [42]:
def load_data(json_file):
    """Load the users and videos tables from a JSON file.

    Parameters
    ----------
    json_file : str or path-like
        Path to a JSON document with top-level ``"users"`` and ``"videos"``
        keys, each holding a collection of records.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(users, videos)`` built from the two top-level keys.

    Raises
    ------
    KeyError
        If either ``"users"`` or ``"videos"`` is missing from the document.
    """
    # Decode explicitly as UTF-8 so non-ASCII names/titles load the same way
    # on every platform instead of depending on the locale's default encoding.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    users = pd.DataFrame(data['users'])
    videos = pd.DataFrame(data['videos'])
    return users, videos

In [43]:
# Read both tables from data.json (path is relative to the working directory).
users, videos = load_data('data.json')

In [44]:
# Preview the first rows of the users table.
users.head()

Unnamed: 0,user_id,name,watch_history
0,1,Alice Johnson,"[101, 102, 103, 104, 105, 106, 107, 108, 109, ..."
1,2,Bob Smith,"[121, 122, 123, 124, 125, 126, 127, 128, 129, ..."
2,3,Carol Williams,"[145, 146, 147, 148, 149, 150, 101, 102, 103, ..."
3,4,David Brown,"[115, 116, 117, 118, 119, 120, 121, 122, 123, ..."
4,5,Emma Davis,"[135, 136, 137, 138, 139, 140, 141, 142, 143, ..."


In [45]:
# Preview the first rows of the videos table.
videos.head()

Unnamed: 0,video_id,title,category,tags,duration
0,101,Introduction to Python,Education,"[python, programming, tutorial]",15:32
1,102,Advanced Python Techniques,Education,"[python, programming, advanced]",22:45
2,103,Python for Data Science,Education,"[python, data science, tutorial]",19:56
3,104,Machine Learning Basics,Education,"[machine learning, AI, tutorial]",25:11
4,105,Neural Networks Explained,Education,"[neural networks, AI, tutorial]",28:34


## EDA

#### Check for Null values

In [46]:
# Count missing values per column in the users table.
users.isnull().sum()

user_id          0
name             0
watch_history    0
dtype: int64

In [47]:
# Count missing values per column in the videos table.
videos.isnull().sum()

video_id    0
title       0
category    0
tags        0
duration    0
dtype: int64

## DATA PREPROCESSING

In [48]:
def preprocess_text(text):
    """Normalize a piece of text for TF-IDF vectorization.

    Steps, in order: expand contractions, drop every character that is not
    an ASCII letter, digit, or whitespace, lowercase, strip surrounding
    whitespace, tokenize, and remove tokens found in ``stop_words``.

    Parameters
    ----------
    text : str
        Raw text (e.g. a video title).

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Expand contractions BEFORE stripping punctuation: once the apostrophe
    # is gone ("don't" -> "dont") the contraction can no longer be recognized.
    text = contractions.fix(text)
    # Remove non-alphanumeric characters, then normalize case and whitespace
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower().strip()
    # tokenize document
    tokens = nltk.word_tokenize(text)
    # filter stopwords out of document (stop_words is lowercase, like the text now)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)

## FEATURE ENGINEERING

In [49]:
# Add a cleaned copy of each title as a new `processed_title` column (modifies `videos` in place).
videos['processed_title'] = videos['title'].apply(preprocess_text)

In [50]:
# Check the new `processed_title` column next to the original titles.
videos.head()

Unnamed: 0,video_id,title,category,tags,duration,processed_title
0,101,Introduction to Python,Education,"[python, programming, tutorial]",15:32,introduction python
1,102,Advanced Python Techniques,Education,"[python, programming, advanced]",22:45,advanced python techniques
2,103,Python for Data Science,Education,"[python, data science, tutorial]",19:56,python data science
3,104,Machine Learning Basics,Education,"[machine learning, AI, tutorial]",25:11,machine learning basics
4,105,Neural Networks Explained,Education,"[neural networks, AI, tutorial]",28:34,neural networks explained


## FEATURE EXTRACTION

In [51]:
def calculate_tfidf(documents):
    """Return the TF-IDF matrix for ``documents``.

    A fresh ``TfidfVectorizer`` with default settings is fitted on the
    documents and used to transform them in one step. The fitted vectorizer
    itself is not returned, so only the matrix is available to the caller.
    """
    return TfidfVectorizer().fit_transform(documents)

In [52]:
# Vectorize the cleaned titles; one TF-IDF row is produced per entry of videos['processed_title'].
tfidf_matrix = calculate_tfidf(videos['processed_title'])

## DOCUMENT SIMILARITY COMPUTATION

In [53]:
def get_user_watch_history(user_id):
    """Return the watch history stored for ``user_id``.

    Parameters
    ----------
    user_id
        Value to look up in the ``user_id`` column of the global ``users``
        DataFrame.

    Returns
    -------
    list
        The ``watch_history`` entry of the first matching row, or an empty
        list when no row matches.
    """
    matches = users.loc[users['user_id'] == user_id, 'watch_history']
    # Check emptiness explicitly instead of truth-testing the underlying
    # NumPy array: that raises for more than one match and is deprecated
    # (an error on recent NumPy releases) for zero matches.
    if matches.empty:
        return []
    return matches.iloc[0]

In [54]:
# Pairwise cosine similarity between every pair of video title vectors
# (with a single argument, the matrix is compared against itself).
similarity_matrix = cosine_similarity(tfidf_matrix)

## MODELING

In [63]:
def recommend_videos(user_id, top_n):
    """Recommend unwatched videos most similar to a user's watch history.

    Each candidate video is scored by the mean cosine similarity between it
    and every video the user has watched, using the global
    ``similarity_matrix``. The matrix rows/columns are taken to be in the
    same order as the rows of the global ``videos`` DataFrame.

    Parameters
    ----------
    user_id
        Value to look up in the ``user_id`` column of ``users``.
    top_n : int
        Maximum number of recommendations to return.

    Returns
    -------
    tuple of (list, list, str or None)
        ``(video_ids, video_titles, user_name)``, best match first. Both
        lists are empty when the user has no watch history or none of the
        watched ids exist in ``videos``. When ``user_id`` is unknown the
        lists are empty and ``user_name`` is ``None``.

    Notes
    -----
    The scores are computed on a copy; the global ``videos`` DataFrame is
    not modified.
    """
    name_matches = users.loc[users['user_id'] == user_id, 'name']
    if name_matches.empty:
        # Unknown user: return the same 3-tuple shape so callers can always unpack.
        return [], [], None
    user_name = name_matches.iloc[0]

    watch_history = get_user_watch_history(user_id)
    if len(watch_history) == 0:
        return [], [], user_name

    # Boolean mask over video rows. It is used positionally, so it stays
    # correct even when `videos` does not have a default 0..n-1 index.
    watched_mask = videos['video_id'].isin(watch_history).to_numpy()
    if not watched_mask.any():
        return [], [], user_name

    # Average the similarity rows of all watched videos -> one score per video.
    similarity_scores = similarity_matrix[watched_mask].mean(axis=0)

    top_candidates = (
        videos.assign(similarity_score=similarity_scores)
        .loc[~watched_mask]
        .sort_values(by='similarity_score', ascending=False)
        .head(top_n)
    )
    recommend_video_indices = top_candidates['video_id'].tolist()
    recommend_video_titles = top_candidates['title'].tolist()

    return recommend_video_indices, recommend_video_titles, user_name


In [65]:
if __name__ == "__main__":
    # Interactive driver: ask for a user ID and a recommendation count,
    # then print the recommendations.

    # --- user ID -----------------------------------------------------------
    # Only the input parsing sits inside the try block, so a ValueError raised
    # elsewhere is never misreported as bad user input.
    while True:
        try:
            user_id = int(input("Enter user ID between 1 and 20: "))
        except ValueError:
            print("Invalid input. Please enter an integer value")
            continue
        if 1 <= user_id <= 20:
            print("Valid user ID")
            break
        print("Invalid user ID. Please enter a value between 1 and 20")

    # --- number of recommendations -----------------------------------------
    while True:
        try:
            n = int(input("Enter number of recommendations: "))
        except ValueError:
            print("Invalid input. Please enter an integer value")
            continue
        # A non-positive count would silently return nothing (0) or the
        # wrong rows (negative values make head() drop rows from the end).
        if n >= 1:
            break
        print("Invalid number of recommendations. Please enter a positive integer")

    recommendations = recommend_videos(user_id, n)

    # recommend_videos may return an empty result; guard before indexing.
    if not recommendations or not recommendations[0]:
        print(f'\nNo recommendations available for user ID {user_id}')
    else:
        video_ids, video_titles, user_name = recommendations
        # Adjacent f-strings instead of a backslash continuation, which
        # embedded the continuation line's indentation in the printed text.
        print(
            f'\nTop {n} recommended videos for {user_name} with ID {user_id}:'
            f'\nVideo Titles: {video_titles}\nVideo IDs: {video_ids}'
        )

Invalid input. Please enter an integer value
Invalid input. Please enter an integer value
Invalid input. Please enter an integer value
Invalid user ID. Please enter a value between 1 and 20
Invalid user ID. Please enter a value between 1 and 20
Valid user ID

Top 3 recommended videos for David Brown with ID 4:                    
Video Titles: ['React for Beginners', 'Hadoop for Beginners', 'Machine Learning Basics']
Video IDs: [108, 144, 104]
