#Import necessary libraries

In [1]:
import numpy as np
import pandas as pd

#Import the datasets

In [2]:
df=pd.read_csv('Story_Synthetic_dataset.csv')

In [3]:
df.head()

Unnamed: 0,story_id,user_id,rating,publication_year,title,timestamp
0,1,189,3,2019,aarjavajeevulu,2025
1,1,177,1,2019,aarjavajeevulu,2019
2,1,98,5,2019,aarjavajeevulu,2024
3,1,155,1,2019,aarjavajeevulu,2025
4,1,187,1,2019,aarjavajeevulu,2023


# Popularity Based Recommender System

##recommendation of books based on the number of ratings and average rating. The basic idea is that books with more ratings or higher average ratings are recommended to users.

###Finding number of ratings for each book

In [4]:
num_rating_df = df.groupby('story_id').count()['rating'].reset_index()
num_rating_df.rename(columns={'rating':'num_ratings'},inplace=True)
num_rating_df

Unnamed: 0,story_id,num_ratings
0,1,130
1,2,146
2,3,106
3,4,148
4,5,60
5,6,148
6,7,68
7,8,140
8,9,78
9,10,96


###Finding average rating for each book

In [None]:
avg_rating_df = df.groupby('story_id')['rating'].mean().reset_index()
avg_rating_df.rename(columns={'rating':'avg_rating'},inplace=True)
avg_rating_df

Unnamed: 0,story_id,avg_rating
0,1,3.038462
1,2,3.116438
2,3,3.04717
3,4,3.047297
4,5,2.633333
5,6,3.054054
6,7,3.088235
7,8,3.2
8,9,3.038462
9,10,3.0


###Popularity based on average rating

In [None]:
popular_df = num_rating_df.merge(avg_rating_df,on='story_id')
popular_df.head()

Unnamed: 0,story_id,num_ratings,avg_rating
0,1,130,3.038462
1,2,146,3.116438
2,3,106,3.04717
3,4,148,3.047297
4,5,60,2.633333


In [None]:
# Sort by number of ratings and average rating (both descending)
top_5_books = popular_df.sort_values(['num_ratings', 'avg_rating'], ascending=False).head(5)

# Display the top 10 popular books
print(top_5_books)



    story_id  num_ratings  avg_rating
13        14          172    3.180233
10        11          153    3.189542
11        12          152    3.190789
5          6          148    3.054054
3          4          148    3.047297


# Collaborative Filtering Based Recommender System

###Filter Active Users – Select users who have rated at least 5 books.

In [None]:
x = df.groupby('user_id').count()['rating'] >= 5
qualified_users = x[x].index

###Filter Popular Books – Keep books with at least 5 ratings.

Refine Ratings Dataset – Include only ratings from selected users for popular books.

In [None]:
filtered_rating = df[df['user_id'].isin(qualified_users)]

In [None]:
y = filtered_rating.groupby('story_id').count()['rating']>=5
famous_books = y[y].index

In [None]:
final_ratings = filtered_rating[filtered_rating['story_id'].isin(famous_books)]

In [None]:
final_ratings

Unnamed: 0,story_id,user_id,rating,publication_year,title,timestamp
0,1,189,3,2019,aarjavajeevulu,2025
1,1,177,1,2019,aarjavajeevulu,2019
2,1,98,5,2019,aarjavajeevulu,2024
3,1,155,1,2019,aarjavajeevulu,2025
4,1,187,1,2019,aarjavajeevulu,2023
...,...,...,...,...,...,...
1726,14,184,5,2021,svarajyam,2024
1727,14,37,5,2021,svarajyam,2023
1728,14,3,4,2021,svarajyam,2024
1729,14,178,5,2021,svarajyam,2024


###Create User-Item Matrix – Convert data into a pivot table where:Rows → Book_ID's,Columns → User IDs,Values → Ratings

In [None]:
pt = final_ratings.pivot_table(index='story_id',columns='user_id',values='rating')

In [None]:
pt.fillna(0,inplace=True)
pt

user_id,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
story_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,2.0,5.0,3.0,0.0,5.0,4.0,3.0,2.0,0.0,...,0.0,4.0,4.0,4.0,0.0,4.0,3.0,0.0,3.0,2.0
2,3.0,3.0,3.0,0.0,4.0,5.0,4.0,1.0,0.0,0.0,...,2.0,2.0,3.0,5.0,3.0,2.0,3.0,5.0,1.0,2.0
3,4.0,5.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,...,4.0,2.0,0.0,2.0,0.0,0.0,4.0,0.0,4.0,0.0
4,3.0,3.0,1.0,2.0,4.0,3.0,0.0,0.0,3.0,4.0,...,4.0,0.0,3.0,2.0,5.0,4.0,3.0,3.0,0.0,3.0
5,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,3.0,2.0,0.0,3.0,0.0,3.0,0.0,0.0,4.0,0.0
6,3.0,0.0,1.0,0.0,2.0,1.0,3.0,0.0,1.0,2.0,...,4.0,0.0,4.0,0.0,4.0,3.0,3.0,0.0,3.0,3.0
7,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0,0.0,...,5.0,0.0,0.0,2.0,0.0,3.0,3.0,3.0,0.0,4.0
8,0.0,0.0,3.0,0.0,0.0,4.0,3.0,1.0,5.0,3.0,...,0.0,3.0,5.0,4.0,0.0,5.0,3.0,3.0,0.0,3.0
9,0.0,2.0,0.0,3.0,4.0,2.0,0.0,0.0,0.0,5.0,...,0.0,0.0,4.0,0.0,4.0,0.0,3.0,0.0,0.0,4.0
10,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0,...,0.0,0.0,0.0,4.0,3.0,3.0,2.0,2.0,0.0,4.0


#Compute Similarity Scores – Use cosine similarity on the user-item matrix to find book similarities.
###Find Similar Books – Sort books based on similarity scores and pick the top recommendations.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_scores = cosine_similarity(pt)
similarity_scores.shape

(14, 14)

In [None]:
def recommend(story_id):
    #Find the index of the story in the pivot table
    if story_id not in pt.index:
        return f"story ID {story_id} not found in the filtered data."

    index = pt.index.get_loc(story_id)

    # Get top 2 most similar stories
    similar_items = sorted(
        list(enumerate(similarity_scores[index])),
        key=lambda x: x[1],
        reverse=True
    )[1:3]  # skip the input story

    data = []
    for i in similar_items:
        similar_story_id = pt.index[i[0]]
        temp_df = df[df['story_id'] == similar_story_id].drop_duplicates('story_id')

        if not temp_df.empty:
            item = [
                temp_df['title'].values[0]
            ]
            data.append(item)

    return data

In [None]:
recommend(2)

[['mantrapu'], ['svarajyam']]

#Time based sorting

In [None]:
from datetime import datetime

# Merge publication_year into popular_df from the original df
publication_year_df = df[['story_id', 'publication_year']].drop_duplicates()
popular_df = popular_df.merge(publication_year_df, on='story_id', how='left')

#Calculate years since publication
current_year = datetime.now().year
popular_df['years_since_publication'] = current_year - popular_df['publication_year']
popular_df['years_since_publication'] = popular_df['years_since_publication'].replace(0, 1)

#Calculate time-based popularity
popular_df['time_based_popularity'] = popular_df['num_ratings'] / popular_df['years_since_publication']

#Sort by time-based popularity
sorted_books = popular_df.sort_values(by='time_based_popularity', ascending=False)

#Display top 10 popular stories
print(sorted_books[['story_id', 'num_ratings', 'years_since_publication', 'time_based_popularity']].head(10))


    story_id  num_ratings  years_since_publication  time_based_popularity
10        11          153                        1             153.000000
5          6          148                        3              49.333333
13        14          172                        4              43.000000
12        13          134                        4              33.500000
0          1          130                        6              21.666667
1          2          146                        8              18.250000
6          7           68                        4              17.000000
11        12          152                       13              11.692308
7          8          140                       17               8.235294
9         10           96                       12               8.000000


#Time period based recommendation


In [None]:
import pandas as pd


df['rating_year'] = df['timestamp']

#Compute yearly trends per story
yearly_trends = df.groupby(['rating_year', 'story_id']).agg(
    rating_count=('rating', 'count'),
    avg_rating=('rating', 'mean')
).reset_index()

#Merge with story titles and publication year
story_info = df[['story_id', 'title', 'publication_year']].drop_duplicates()
yearly_trends = yearly_trends.merge(story_info, on='story_id', how='left')

#Function to get trending stories in a year range
def get_trending_stories(start_year, end_year):
    filtered = yearly_trends[
        (yearly_trends['rating_year'] >= start_year) &
        (yearly_trends['rating_year'] <= end_year)
    ]

    # Aggregate total ratings and average ratings
    popular_stories = filtered.groupby(['story_id', 'title', 'publication_year']).agg(
        total_ratings=('rating_count', 'sum'),
        overall_avg_rating=('avg_rating', 'mean')
    ).reset_index()

    # Sort: most rated, then best average rating
    popular_stories = popular_stories.sort_values(
        by=['total_ratings', 'overall_avg_rating'], ascending=[False, False]
    )

    return popular_stories

# # Example: Get top stories from 2019 to 2022
# start_year = 2019
# end_year = 2022
# popular_stories = get_trending_stories(start_year, end_year)

# # 6. Show top 10
# print(f"Top Trending Stories from {start_year} to {end_year}:\n")
# print(popular_stories.head(5))


Top Trending Stories from 2019 to 2022:

    story_id                   title  publication_year  total_ratings  \
1          2  abhaayaranyam lo amber              2017             70   
0          1          aarjavajeevulu              2019             70   
12        14               svarajyam              2021             68   
11        13       manchi snehithulu              2021             57   
10        12              pantikinda              2012             43   

    overall_avg_rating  
1             3.122917  
0             3.059707  
12            3.300877  
11            2.864764  
10            3.002451  


#Context-aware recommendation.

##Using Calenderific API to fetch relevant stories based on occasions in a particular month of an year

In [None]:
!pip install PyMuPDF


Collecting PyMuPDF
  Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.0


In [None]:
def list_extracted_files(extract_path):
    files = os.listdir(extract_path)
    print(f"[INFO] Extracted Files: {files}")

In [None]:
import zipfile
import os
from datetime import datetime
import requests
import unicodedata
from sentence_transformers import SentenceTransformer, util

#Model Loading
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

#Normalize text
def normalize_text(text):
    text = text.lower().strip()
    return unicodedata.normalize("NFC", text)

#Occasion Keywords
occasion_keywords = {
    "Diwali": [
        "దీపాలు", "పండుగ", "అలంకారాలు", "తీపులు", "కుటుంబం", "ఆనందం",
        "దీపాల వరుస", "పటాకులు", "లడ్డూ", "నూతన దుస్తులు", "కృష్ణుడు",
        "లాంపులు", "దీపోత్సవం", "పూజ", "కాంతులు"
    ],
    "Children's Day": [
        "పిల్లలు", "బాల్యం", "ఆటలు", "పాఠశాల", "సంతోషం", "శిక్షణ",
        "జవహర్‌లాల్ నెహ్రూ", "టీచర్లు", "ఉత్సవాలు", "పోటీలు", "బహుమతులు",
        "నాటికలు", "కథలు", "బొమ్మలు", "కలల ప్రపంచం"
    ],
    "Christmas": [
        "క్రిస్మస్", "బహుమతులు", "చిన్నారి", "చర్చి", "సంతోషం", "తల్లి తండ్రులు",
        "క్రిస్మస్ చెట్టు", "సాంటా క్లాజ్", "జింగిల్ బెల్స్", "క్రిస్మస్ పాటలు",
        "గుడ్లు", "క్రిస్మస్ స్టార్", "క్రిస్మస్ పిండి వంటలు", "ప్రార్థనలు", "మౌలికత్వం"
    ],
    "Dussehra": [
        "దసరా", "విజయదశమి", "రావణ దహనం", "అయోధ్య", "రాముడు", "సీత",
        "హనుమాన్", "రామాయణం", "పూజ", "బొమ్మల కోలువు", "శక్తి పూజ",
        "దుర్గమ్మ", "ఆలయం", "నవరాత్రులు", "ఆనందం"
    ],
    "Independence Day": [
        "స్వాతంత్ర్య దినోత్సవం", "జెండా", "పతాకావందనం", "భారతదేశం", "జవాన్లు",
        "ఆజాదీ", "గణతంత్రం", "ప్రముఖ నాయకులు", "సభలు", "రాష్ట్ర గీతం",
        "పరేడ్", "పరాక్రమం", "దేశభక్తి", "పాత్రత", "మంచి పౌరుడు"
    ],
    "Republic Day": [
        "గణతంత్ర దినోత్సవం", "భారత రాజ్యాంగం", "డాక్టర్ అంబేద్కర్", "జెండా ఊపడం",
        "రాజ్ పథ్ పరేడ్", "సైనిక ప్రదర్శన", "జాతీయ గీతం", "త్రివర్ణ పతాకం",
        "పతాకావందనం", "సాంస్కృతిక ప్రదర్శనలు", "ప్రముఖ అతిథులు", "భవిష్యత్ భావనలు",
        "దేశభక్తి పాటలు", "ప్రమాణ స్వీకారం", "భారతీయత"
    ],
    "Friendship Day": [
        "మిత్రత్వం", "స్నేహితులు", "స్నేహం", "పండుగ", "బంధం", "ఆనందం",
        "బహుమతులు", "స్నేహపత్రికలు", "జ్ఞాపకాలు", "సంబంధాలు", "ఆప్యాయత",
        "స్నేహసూక్తులు", "పాటలు", "పిక్నిక్", "సెల్ఫీలు",
        "సహచరులు", "పరిచయాలు", "మాటలు", "సమ్మేళనం", "ఆరాధనలు"
    ],
    "Raksha Bandhan": [
        "రాఖీ", "బంధం", "సోదరుడు", "సోదరి", "సంకల్పం", "సురక్షితుడు",
        "బంధు", "పండుగ", "సహాయం", "ఆప్యాయత", "స్నేహం",
        "సంధి", "బంధువు", "కటాక్షం", "పరిశుభ్రత", "బంధం ప్రతిజ్ఞ",
        "తల్లి", "దుప్పటి", "పండుగ వాతావరణం", "ఆచారాలు"
    ]
}

occasion_keywords = {
    normalize_text(k): [normalize_text(w) for w in v] for k, v in occasion_keywords.items()
}

#Extract ZIP of Text Files
def extract_zip(zip_path, extract_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)


# Read Text Files and Load Stories
def load_stories_from_text_files(folder_path):
    stories = []
    for foldername, subfolders, filenames in os.walk(folder_path):  # Traverse all subfolders
        for filename in filenames:
            if filename.endswith(".txt"):
                file_path = os.path.join(foldername, filename)
                try:
                    with open(file_path, 'r', encoding='utf-8') as file:
                        content = file.read()
                        if content.strip():
                        #     print(f"[DEBUG] Content of {filename}: {content[:200]}...")  # Preview first 200 characters
                            stories.append({
                                "title": filename.replace(".txt", ""),
                                "content": content.strip()
                            })
                except Exception as e:
                    print(f"[ERROR] Failed to read {filename}: {e}")
    return stories


#Fetch Holidays using Calendarific API
def get_festivals_from_calendarific(api_key, country="IN", month=None, year=None):
    if not month:
        month = datetime.now().month
    if not year:
        year = datetime.now().year

    try:
        res = requests.get("https://calendarific.com/api/v2/holidays", params={
            "api_key": api_key,
            "country": country,
            "year": year,
            "month": month
        })
        res.raise_for_status()
        holidays = res.json()["response"]["holidays"]
        return [normalize_text(h["name"]) for h in holidays]
    except Exception as e:
        print(f"[ERROR] Failed to fetch holidays: {e}")
        return []

#Recommend Stories
def recommend_stories_with_api(stories, occasion_keywords, api_key, user_selected_occasion=None, month=None, year=None, top_n=2):
    #Determine Occasion
    occasion = None
    if user_selected_occasion:
        occasion = normalize_text(user_selected_occasion)
    else:
        fetched_festivals = get_festivals_from_calendarific(api_key, month=month, year=year)
        print("[INFO] Fetched Festivals from API:", fetched_festivals)
        for fest in fetched_festivals:
            if fest in occasion_keywords:
                occasion = fest
                break

    if not occasion:
        print("[INFO] No matching occasion found.")
        return []

    if occasion not in occasion_keywords:
        print(f"[WARNING] No keywords defined for occasion: {occasion}")
        return []

    #Embed keywords
    keywords_text = " ".join(occasion_keywords[occasion])
    keywords_embedding = model.encode(keywords_text, convert_to_tensor=True)

    #Score stories
    story_scores = []
    for story in stories:
        story_embedding = model.encode(story["content"], convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(story_embedding, keywords_embedding).item()
        story_scores.append((story["title"], similarity))

    #Display results
    sorted_stories = sorted(story_scores, key=lambda x: x[1], reverse=True)
    top_stories = sorted_stories[:top_n]

    print(f"\n[INFO] Occasion detected: {occasion.title()}")
    print(f"Top {top_n} relevant stories:\n")
    for i, (title, score) in enumerate(top_stories, start=1):
        print(f"{i}. {title} (Similarity: {score:.2f})")

    return top_stories

# === Main Execution ===

#Define paths
zip_path = "/content/Telugu_stories_text_files (2).zip"
extract_path = "/content/telugu_stories_extracted"

def list_extracted_files(extract_path):
    files = os.listdir(extract_path)
    print(f"[INFO] Extracted Files: {files}")

#Extract ZIP
extract_zip(zip_path, extract_path)
list_extracted_files(extract_path)

#Load stories from Text Files
stories = load_stories_from_text_files(extract_path)
print(f"[INFO] Loaded {len(stories)} stories.")

#Recommend based on detected or selected occasion
api_key = "dP1ErMp4DJVNNUdOC1yFo8Y8lG31dWKl"
recommend_stories_with_api(stories, occasion_keywords, api_key, user_selected_occasion=None, month=8)


[INFO] Extracted files to: /content/telugu_stories_extracted
[INFO] Extracted Files: ['Telugu_stories_text_files']
[INFO] Loaded 13 stories.
[INFO] Fetched Festivals from API: ['friendship day', 'raksha bandhan (rakhi)', 'independence day', 'janmashtami (smarta)', 'parsi new year', 'janmashtami', 'ganesh chaturthi/vinayaka chaturthi']

[INFO] Occasion detected: Independence Day
Top 2 relevant stories:

1. bhutaddalu (Similarity: 0.92)
2. dantOdantaM (Similarity: 0.92)


[('bhutaddalu', 0.9242951273918152), ('dantOdantaM', 0.9239039421081543)]

In [None]:
# === Import Required Libraries ===
import zipfile
import os
from datetime import datetime
import requests
import re
import unicodedata
from sentence_transformers import SentenceTransformer, util

#Load Pretrained Sentence Transformer Model
# This model generates multilingual semantic embeddings, useful for finding story-occasion similarity
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

#Normalize Text Function
# Helps ensure uniform comparison by normalizing characters and removing case/space differences


def normalize_text(text):
    text = text.lower().strip()
    text = unicodedata.normalize("NFC", text)
    # Remove apostrophes and punctuations for better matching
    text = re.sub(r"[’'`]", "", text)
    text = re.sub(r"[^a-z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text

occasion_keywords = {
    "Diwali": [
        "దీపాలు", "పండుగ", "అలంకారాలు", "తీపులు", "కుటుంబం", "ఆనందం",
        "దీపాల వరుస", "పటాకులు", "లడ్డూ", "నూతన దుస్తులు", "కృష్ణుడు",
        "లాంపులు", "దీపోత్సవం", "పూజ", "కాంతులు"
    ],
    "Children's Day": [
        "పిల్లలు", "బాల్యం", "ఆటలు", "పాఠశాల", "సంతోషం", "శిక్షణ",
        "జవహర్‌లాల్ నెహ్రూ", "టీచర్లు", "ఉత్సవాలు", "పోటీలు", "బహుమతులు",
        "నాటికలు", "కథలు", "బొమ్మలు", "కలల ప్రపంచం"
    ],
    "Christmas": [
        "క్రిస్మస్", "బహుమతులు", "చిన్నారి", "చర్చి", "సంతోషం", "తల్లి తండ్రులు",
        "క్రిస్మస్ చెట్టు", "సాంటా క్లాజ్", "జింగిల్ బెల్స్", "క్రిస్మస్ పాటలు",
        "గుడ్లు", "క్రిస్మస్ స్టార్", "క్రిస్మస్ పిండి వంటలు", "ప్రార్థనలు", "మౌలికత్వం"
    ],
    "Dussehra": [
        "దసరా", "విజయదశమి", "రావణ దహనం", "అయోధ్య", "రాముడు", "సీత",
        "హనుమాన్", "రామాయణం", "పూజ", "బొమ్మల కోలువు", "శక్తి పూజ",
        "దుర్గమ్మ", "ఆలయం", "నవరాత్రులు", "ఆనందం"
    ],
    "Independence Day": [
        "స్వాతంత్ర్య దినోత్సవం", "జెండా", "పతాకావందనం", "భారతదేశం", "జవాన్లు",
        "ఆజాదీ", "గణతంత్రం", "ప్రముఖ నాయకులు", "సభలు", "రాష్ట్ర గీతం",
        "పరేడ్", "పరాక్రమం", "దేశభక్తి", "పాత్రత", "మంచి పౌరుడు"
    ],
    "Republic Day": [
        "గణతంత్ర దినోత్సవం", "భారత రాజ్యాంగం", "డాక్టర్ అంబేద్కర్", "జెండా ఊపడం",
        "రాజ్ పథ్ పరేడ్", "సైనిక ప్రదర్శన", "జాతీయ గీతం", "త్రివర్ణ పతాకం",
        "పతాకావందనం", "సాంస్కృతిక ప్రదర్శనలు", "ప్రముఖ అతిథులు", "భవిష్యత్ భావనలు",
        "దేశభక్తి పాటలు", "ప్రమాణ స్వీకారం", "భారతీయత"
    ],
    "Friendship Day": [
        "మిత్రత్వం", "స్నేహితులు", "స్నేహం", "పండుగ", "బంధం", "ఆనందం",
        "బహుమతులు", "స్నేహపత్రికలు", "జ్ఞాపకాలు", "సంబంధాలు", "ఆప్యాయత",
        "స్నేహసూక్తులు", "పాటలు", "పిక్నిక్", "సెల్ఫీలు",
        "సహచరులు", "పరిచయాలు", "మాటలు", "సమ్మేళనం", "ఆరాధనలు"
    ],
    "Raksha Bandhan": [
        "రాఖీ", "బంధం", "సోదరుడు", "సోదరి", "సంకల్పం", "సురక్షితుడు",
        "బంధు", "పండుగ", "సహాయం", "ఆప్యాయత", "స్నేహం",
        "సంధి", "బంధువు", "కటాక్షం", "పరిశుభ్రత", "బంధం ప్రతిజ్ఞ",
        "తల్లి", "దుప్పటి", "పండుగ వాతావరణం", "ఆచారాలు"
    ],
    "Mother's Day": [
        "తల్లి", "మాతృదినోత్సవం", "ప్రేమ", "ఆశీస్సులు", "పూజ",
        "కృతజ్ఞత", "బహుమతులు", "ఆప్యాయత", "కుటుంబం", "సంరక్షణ",
        "సేవ", "సంకల్పం", "సమ్మానం", "పుష్పాలు", "హృదయం",
        "ఆనందం", "వందనం", "స్పర్శ", "స్నేహం", "కథలు"
    ],
    "Father's Day": [
        "తండ్రి", "పితృదినోత్సవం", "బలము", "ప్రేరణ", "సేవ",
        "ఆప్యాయత", "బహుమతులు", "కుటుంబం", "సమయము", "కృషి",
        "గౌరవం", "ఆదర్శం", "కృతజ్ఞత", "రక్షణ", "విశ్వాసం",
        "సమ్మానం", "పాటలు", "కథలు", "ఆనందం", "భక్తి"
    ]
}


# Normalize all keys and values in the keywords dictionary for consistency
occasion_keywords = {
    normalize_text(k): [normalize_text(w) for w in v] for k, v in occasion_keywords.items()
}

#Extract ZIP File Containing Story Texts
def extract_zip(zip_path, extract_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

#=Load and Read Stories from Extracted Files
def load_stories_from_text_files(folder_path):
    stories = []
    for foldername, subfolders, filenames in os.walk(folder_path):  # Traverse through all folders
        for filename in filenames:
            if filename.endswith(".txt"):  # Only process text files
                file_path = os.path.join(foldername, filename)
                try:
                    with open(file_path, 'r', encoding='utf-8') as file:
                        content = file.read()
                        if content.strip():  # Only add non-empty stories
                            stories.append({
                                "title": filename.replace(".txt", ""),
                                "content": content.strip()
                            })
                except Exception as e:
                    print(f"[ERROR] Failed to read {filename}: {e}")
    return stories

#Fetch Festival List from Calendarific API
def get_festivals_from_calendarific(api_key, country="IN", month=None, year=None):
    if not month:
        month = datetime.now().month
    if not year:
        year = datetime.now().year

    try:
        res = requests.get("https://calendarific.com/api/v2/holidays", params={
            "api_key": api_key,
            "country": country,
            "year": year,
            "month": month
        })
        res.raise_for_status()
        holidays = res.json()["response"]["holidays"]
        return [normalize_text(h["name"]) for h in holidays]
    except Exception as e:
        print(f"[ERROR] Failed to fetch holidays: {e}")
        return []

#Match Stories with Occasion Based on Semantic Similarity
def recommend_stories_with_api(stories, occasion_keywords, api_key, user_selected_occasion=None, month=None, year=None, top_n=2):
    #Determine Occasion
    occasion = None
    if user_selected_occasion:
        occasion = normalize_text(user_selected_occasion)
    else:
        # Automatically detect current festival from API
        fetched_festivals = get_festivals_from_calendarific(api_key, month=month, year=year)
        print("[INFO] Fetched Festivals from API:", fetched_festivals)
        for fest in fetched_festivals:
            if fest in occasion_keywords:
                occasion = fest
                break

    if not occasion:
        print("[INFO] No matching occasion found.")
        return []

    if occasion not in occasion_keywords:
        print(f"[WARNING] No keywords defined for occasion: {occasion}")
        return []

    #Create Embedding for Occasion Keywords
    keywords_text = " ".join(occasion_keywords[occasion])
    keywords_embedding = model.encode(keywords_text, convert_to_tensor=True)

    #Compare Each Story's Embedding with Occasion Keywords
    story_scores = []
    for story in stories:
        story_embedding = model.encode(story["content"], convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(story_embedding, keywords_embedding).item()
        story_scores.append((story["title"], similarity))

    #Sort by Similarity and Return Top Matches
    sorted_stories = sorted(story_scores, key=lambda x: x[1], reverse=True)
    top_stories = sorted_stories[:top_n]

    print(f"\n[INFO] Occasion detected: {occasion.title()}")
    print(f"Top {top_n} relevant stories:\n")
    for i, (title, score) in enumerate(top_stories, start=1):
        print(f"{i}. {title} (Similarity: {score:.2f})")

    return top_stories

# Main Execution Logic

# 1. Define paths for input ZIP and extracted folder
zip_path = "/content/Telugu_stories_text_files (2).zip"
extract_path = "/content/telugu_stories_extracted"

# Helper to view extracted file names
def list_extracted_files(extract_path):
    files = os.listdir(extract_path)
    print(f"[INFO] Extracted Files: {files}")

# 2. Extract ZIP file containing .txt story files
extract_zip(zip_path, extract_path)
list_extracted_files(extract_path)

# 3. Load stories from extracted .txt files
stories = load_stories_from_text_files(extract_path)
print(f"[INFO] Loaded {len(stories)} stories.")

# 4. Recommend stories based on detected festival (or user-provided)
api_key = "dP1ErMp4DJVNNUdOC1yFo8Y8lG31dWKl"  # Calendarific API key
recommend_stories_with_api(
    stories,
    occasion_keywords,
    api_key,
    user_selected_occasion=None,  # Set to something like "Christmas" to manually specify
    month=None  # Specify month (6 for example)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[INFO] Extracted Files: ['Telugu_stories_text_files']
[INFO] Loaded 13 stories.
[INFO] Fetched Festivals from API: ['bakrid', 'fathers day', 'june solstice', 'rath yatra']

[INFO] Occasion detected: Fathers Day
Top 2 relevant stories:

1. illu.html (Similarity: 0.45)
2. aarjavajeevulu (Similarity: 0.43)


[('illu.html', 0.4479082524776459), ('aarjavajeevulu', 0.4326225519180298)]