# Build Tiktok-Style Shorts Recommendation with just raw videos
Setup the requirements packages, Gemini API Key (Free) and Access to Google Cloud Storage

In [None]:
PROJECT_ID = 'YOUR-PROJECT-ID' # change this to your project ID

In [12]:
from google.cloud import storage

client = storage.Client(project=PROJECT_ID)
bucket_name = 'your-bucket-name' # change this to your bucket name

sdr_files = []
for blob in client.list_blobs(bucket_name, prefix='small'): # change the prefix to the folder you want to list
  sdr_files.append(str(blob.name))

print(len(sdr_files))

2659


In [13]:
import vertexai
from vertexai.generative_models import GenerativeModel, Part
import random 

vertexai.init(project=PROJECT_ID, location="us-central1")
vision_model = GenerativeModel("gemini-1.5-flash-002")

results = []

# sampling 50 files
sdr_files_sample = random.sample(sdr_files, 50)

for index, short_file in enumerate(sdr_files_sample):
    path = f"gs://{bucket_name}/{short_file}"

    # Generate text
    response = vision_model.generate_content(
        [
            Part.from_uri(
                path, mime_type="video/mp4"
            ),
            """Watch each frame in video. Do not make up any information that is not part of the video. 
            Generate title of video, genres, short description and taggings of content in JSON format.

            Use this JSON schema:
            {
                "title": "string",
                "tags": ["string"],
                "genres": ["string"],
                "description": "string"
            }
            """
        ]
    )

    results.append({"path": path, "response": response.text})

In [14]:
import json 
import re

processed_results = []

for r in results:
    match = re.search(r'```json\n(.*?)\n```', r['response'], re.DOTALL)
    if match:
        cleaned_string = match.group(1).strip()
        try:
            json_object = json.loads(cleaned_string)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON for {r['path']}")
            print(f"Error: {e}")
            print(f"Cleaned String: {cleaned_string}")
            continue
    else:
        print(f"No JSON content found for {r['path']}")

    # Append the processed result to the list
    processed_results.append({"path": r["path"], "response": json_object})

In [15]:
import pandas as pd 

df = pd.DataFrame(processed_results)
response_df = pd.json_normalize(df['response'])
final_df = pd.concat([df, response_df], axis=1)
final_df['file'] = final_df['path'].apply(lambda x: x.split('/')[-1])
final_df = final_df.drop(columns=['path', 'response'])
final_df.head()

Unnamed: 0,title,tags,genres,description,file
0,From which city are you watching?,"[women, dresses, formal wear, party, event, qu...","[Social Media, Short Video]",Five women in elegant dresses pose for a short...,SDR_Dance_nhiz.mp4
1,ਸਾਈਂ ਦਲਬੀਰ ਸਿੰਘ ਜੀ ਦਾ ਭਾਸ਼ਣ,"[ਸਾਈਂ ਦਲਬੀਰ ਸਿੰਘ, ਭਾਸ਼ਣ, NEWS18 ਪੰਜਾਬ, ਧਾਰਮਿਕ]","[News, Politics, Religious]",ਸਾਈਂ ਦਲਬੀਰ ਸਿੰਘ ਜੀ ਦਾ ਇੱਕ ਭਾਸ਼ਣ।,SDR_Society_wvxk.mp4
2,Serene River Flowing Through a Mountain Gorge,"[river, mountain, nature, water, gorge, serene...","[Nature, Travel]",A crystal-clear river flows through a lush mou...,SDR_Hobby_v4op.mp4
3,Minecraft Skyblock Gameplay,"[Minecraft, Skyblock, Gameplay, Gaming, Video ...","[Gaming, Adventure]",A short video showcasing gameplay of a Minecra...,SDR_Gameplay_xbqj.mp4
4,Stunning Home Tour: Modern Living Room with Gr...,"[home tour, modern design, interior design, li...","[Home and Garden, Real Estate]",A captivating video showcasing a beautifully d...,SDR_Hobby_6q15.mp4


In [19]:
import google.generativeai as genai
import os

genai.configure(api_key=os.environ["GEMINI_API_KEY"]) # change this to your API key

embeddings = []

for index, row in final_df.iterrows():
    text = f"""
       *Title:**  {row['title']} **Genres:** {', '.join(row['genres'])}
       **Short Description:** {row['description']}
       **Taggings:** {', '.join(row['tags'])}"""

    result = genai.embed_content(
       model="models/text-embedding-004",
       content=text)

    final_df.loc[index, 'embedding'] = str(result['embedding'])

final_df.head()

Unnamed: 0,title,tags,genres,description,file,embedding
0,From which city are you watching?,"[women, dresses, formal wear, party, event, qu...","[Social Media, Short Video]",Five women in elegant dresses pose for a short...,SDR_Dance_nhiz.mp4,"[-0.012115993, 0.022727795, 0.025023129, -0.04..."
1,ਸਾਈਂ ਦਲਬੀਰ ਸਿੰਘ ਜੀ ਦਾ ਭਾਸ਼ਣ,"[ਸਾਈਂ ਦਲਬੀਰ ਸਿੰਘ, ਭਾਸ਼ਣ, NEWS18 ਪੰਜਾਬ, ਧਾਰਮਿਕ]","[News, Politics, Religious]",ਸਾਈਂ ਦਲਬੀਰ ਸਿੰਘ ਜੀ ਦਾ ਇੱਕ ਭਾਸ਼ਣ।,SDR_Society_wvxk.mp4,"[0.0345794, 0.034678414, -0.06278941, -0.01626..."
2,Serene River Flowing Through a Mountain Gorge,"[river, mountain, nature, water, gorge, serene...","[Nature, Travel]",A crystal-clear river flows through a lush mou...,SDR_Hobby_v4op.mp4,"[-0.011046564, 0.01635406, -0.03524917, -0.002..."
3,Minecraft Skyblock Gameplay,"[Minecraft, Skyblock, Gameplay, Gaming, Video ...","[Gaming, Adventure]",A short video showcasing gameplay of a Minecra...,SDR_Gameplay_xbqj.mp4,"[-0.023567451, -0.031801246, -0.008186578, -0...."
4,Stunning Home Tour: Modern Living Room with Gr...,"[home tour, modern design, interior design, li...","[Home and Garden, Real Estate]",A captivating video showcasing a beautifully d...,SDR_Hobby_6q15.mp4,"[-0.04826035, -0.0449679, 0.029194266, -0.0118..."


In [20]:
import ast

final_df['embedding'] = final_df['embedding'].apply(lambda x: list(map(float, ast.literal_eval(x))))

In [21]:
import numpy as np

from sklearn.metrics.pairwise import pairwise_distances

# Function to calculate cosine similarity
def calculate_cosine_similarity(file_name):
    # Retrieve the embedding for the given file name
    target_embedding = final_df.loc[final_df['file'] == file_name, 'embedding'].values[0]
    
    # Convert embeddings to a 2D array
    embeddings = np.vstack(final_df['embedding'].values)

    # Calculate cosine distances
    distances = pairwise_distances([target_embedding], embeddings, metric='cosine')[0]
    
    # Convert distances to similarities
    similarities = 1 - distances
    
    # Add similarities to the DataFrame
    final_df['cosine_similarity'] = similarities
    
    return final_df[['file', 'title', 'cosine_similarity']]

# Example usage
file_name = 'SDR_Hobby_v4op.mp4'
similarity_df = calculate_cosine_similarity(file_name)
similarity_df.sort_values(by='cosine_similarity', ascending=False).head()

Unnamed: 0,file,title,cosine_similarity
2,SDR_Hobby_v4op.mp4,Serene River Flowing Through a Mountain Gorge,1.0
43,SDR_Hobby_rdby.mp4,Fishing Perch,0.637745
19,SDR_Animal_svq5.mp4,Orange and White Kitten Exploring Grass,0.616769
15,SDR_Hobby_rl1n.mp4,Northern Pike Caught on Lure,0.57845
27,SDR_Health_1p77.mp4,Nature's Consequences,0.573209


In [22]:
final_df.to_csv('final_df.csv', index=False)