In [1]:
import sys
sys.path.append('/Users/pablojerezarnau/git/RS-backend/')

import json
from config.settings import TOPIC_MODELING_RUNS_DIR
import os
from collections import OrderedDict

def generate_topic_tokens_dict(file_path):
    """
    Build an ordered mapping from each topic number to its token names.

    The JSON file at ``file_path`` is loaded and iterated as a sequence of
    topic objects. For every topic, its 'tokens' entry is walked as a sequence
    of dictionaries; the keys of those dictionaries, in order, become the
    token list for that topic (the associated values are discarded).

    :param file_path: Path to the topics.json file.
    :return: OrderedDict mapping topic_number to its list of token names,
             ordered by ascending topic_number.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        loaded_topics = json.load(handle)

    tokens_by_topic = {}
    for entry in loaded_topics:
        # Collect only the token names; the per-token values are not needed here.
        names = []
        for token_scores in entry['tokens']:
            names.extend(token_scores.keys())
        tokens_by_topic[entry['topic_number']] = names

    # Insert in ascending key order so iteration follows topic_number.
    ordered = OrderedDict()
    for number, names in sorted(tokens_by_topic.items()):
        ordered[number] = names
    return ordered

In [2]:
# Example usage: load the token lists for one topic-modeling run.
run_id = '20240319_153921_dataset3_topics_300'
run_dir = os.path.join(TOPIC_MODELING_RUNS_DIR, run_id)
file_path = os.path.join(run_dir, 'topics.json')

topic_tokens_dict = generate_topic_tokens_dict(file_path)

# Show every topic with its tokens, following the dictionary's (sorted) order.
for topic_id in topic_tokens_dict:
    tokens = topic_tokens_dict[topic_id]
    print(f"Topic {topic_id}: {tokens}")

Topic 0: ['girl', 'female', 'swimming', 'russian', 'diving', 'boy', 'stretch', 'dress', 'pool', 'woman']
Topic 1: ['fail', 'funny', 'compilation', 'instant', 'stupid', 'well', 'epic', 'afv', 'funniest', 'zelda']
Topic 2: ['technology', 'tech', 'blockchain', 'gadget', 'future', 'late', 'computer', 'innovation', 'technical', 'internet']
Topic 3: ['asmr', 'cooking', 'satisfy', 'sound', 'eat', 'mukbang', 'camping', 'relax', 'cook', 'sleep']
Topic 4: ['energy', 'renewable', 'power', 'solar', 'wind', 'country', 'source', 'green', 'clean', 'sustainable']
Topic 5: ['get', 'talent', 'surfer', 'wave', 'agt', 'america', 'watch', 'subway', 'barrel', 'audition']
Topic 6: ['furniture', 'smart', 'save', 'space', 'bedroom', 'door', 'karachi', 'small', 'market', 'idea']
Topic 7: ['interior', 'design', 'house', 'room', 'bedroom', 'home', 'modern', 'decor', 'living', 'designer']
Topic 8: ['real', 'estate', 'wholesale', 'not', 'feel', 'ghost', 'free', 'invest', 'property', 'deal']
Topic 9: ['gym', 'fitnes

In [3]:
# Short human-readable label for each topic, keyed by integer topic number (0-299).
# Passed to add_descriptions_to_topics(), which looks each label up by a topic's
# 'topic_number'.
# NOTE(review): some labels repeat across different topic numbers (e.g.
# "Gardening Tips" for 25 and 48, "WhatsApp Status" for 80 and 256,
# "Piano Tutorial" for 84 and 263) - confirm whether those topics really are
# indistinguishable or need sharper labels.
# Entries 0-99 use double quotes and 100-299 use single quotes; the values are
# plain strings either way.
topic_descriptions = {
    0: "Swimming Diversity",
    1: "Funny Fail Compilation",
    2: "Tech and Blockchain",
    3: "ASMR Cooking and Sounds",
    4: "Renewable Energy",
    5: "Talent Surfing",
    6: "Smart Furniture",
    7: "Interior Design",
    8: "Real Estate Insights",
    9: "Gym and Fitness",
    10: "English Speaking Practice",
    11: "Digital Nomad Services",
    12: "Adorable Baby Photos",
    13: "Market Predictions",
    14: "Mountain Cycling Stunts",
    15: "Career Success Strategies",
    16: "Fruit Cutting Tips",
    17: "Budget Travel Guides",
    18: "Blender Animation",
    19: "Fishing Techniques",
    20: "Athletic Moments",
    21: "World Conservation",
    22: "Ethical Hacking",
    23: "DIY Candle Making",
    24: "Adventure Activities",
    25: "Gardening Tips",
    26: "Smart Home Gadgets",
    27: "Onion Pakoda Recipe",
    28: "Fat Reduction Exercises",
    29: "Effective Fishing",
    30: "Home Science Experiments",
    31: "Woolen Flower Design",
    32: "Stock Market Trends",
    33: "Balloon Decoration Ideas",
    34: "Naruto Anime Clash",
    35: "Beginner Programming",
    36: "TikTok Challenges",
    37: "Hindi Film Summaries",
    38: "Simple Hairstyles",
    39: "Social Media Marketing",
    40: "Hit Music and Relaxation",
    41: "Programming Tutorials",
    42: "Podcast Interviews",
    43: "Taekwondo Kick Tutorial",
    44: "National Day Celebrations",
    45: "DIY Wall Decor",
    46: "Awareness Campaigns",
    47: "Jewelry Making",
    48: "Gardening Tips",
    49: "Photo Editing Tips",
    50: "Reel Tweets",
    51: "Animation Effects",
    52: "TV Watch",
    53: "Robotic Links",
    54: "Farm Tractor",
    55: "Welding Skills",
    56: "Fashion Haul",
    57: "Urdu Calligraphy",
    58: "Bike Sports",
    59: "Quran Translation",
    60: "Electric Vehicles",
    61: "Motivation Views",
    62: "Music Beats",
    63: "Funny Pranks",
    64: "Mobile Gaming",
    65: "Creative Ideas",
    66: "Free Fire",
    67: "Khan Trailer",
    68: "Graphic Design",
    69: "Sofa Bed",
    70: "Comedy Standup",
    71: "Hair Style",
    72: "Easy Drawing",
    73: "Bird Sounds",
    74: "Chess Tricks",
    75: "Price Patterns",
    76: "Software Engineer",
    77: "TikTok Trends",
    78: "Piano Music",
    79: "Financial Advice",
    80: "WhatsApp Status",
    81: "Urdu Shayari",
    82: "Physics Class",
    83: "Android Games",
    84: "Piano Tutorial",
    85: "Paint Techniques",
    86: "Web Animation",
    87: "Coding Boost",
    88: "Flower Arrangement",
    89: "Crypto News",
    90: "School Time",
    91: "Health Wellness",
    92: "Easy Crafts",
    93: "Numerology Insights",
    94: "Learning Machines",
    95: "Peppa Pig",
    96: "Recycle Bottle",
    97: "Event Planning",
    98: "Cheese Pizza",
    99: "Programming Basics",
    100: 'Trading Strategy',
    101: 'Indian Drive',
    102: 'Egg Challenge',
    103: 'Review Opinion',
    104: 'Gaming Highlight',
    105: 'Diwali Dress',
    106: 'DJ Setup',
    107: 'Engine Start',
    108: 'Makeup Look',
    109: 'Study Motivation',
    110: 'Movie Explain',
    111: 'Embroidery Stitch',
    112: 'Sport Bike',
    113: 'TikTok Compilation',
    114: 'Puppy Training',
    115: 'Ariana Vocal',
    116: 'Design Mehndi',
    117: 'Tattoo Design',
    118: 'Funny Meme',
    119: 'Chess Puzzle',
    120: 'Samsung Galaxy',
    121: 'Book Recommendation',
    122: 'Hindi Movie',
    123: 'Pose Stylish',
    124: 'Mobile Trick',
    125: 'Paneer Recipe',
    126: 'Hacker Attitude',
    127: 'Truck Driver',
    128: 'Jupiter Moon',
    129: 'Craft Idea',
    130: 'Attitude Status',
    131: 'Motivational Speech',
    132: 'Tamil Voice',
    133: 'Movie Recap',
    134: 'Luxury Dubai',
    135: 'Fashion Style',
    136: 'India Sportbike',
    137: 'Dance Performance',
    138: 'Roblox Friend',
    139: 'Thriller Series',
    140: 'Writing Class',
    141: 'Content Tool',
    142: 'Market Prediction',
    143: 'Bow Archery',
    144: 'Love Letter',
    145: 'Engagement Ring',
    146: 'Note Death',
    147: 'Live Today',
    148: 'Piano Sheet',
    149: 'Moon View',
    150: 'Psychology Insight',
    151: 'Reddit Reaction',
    152: 'Online Income',
    153: 'Drone Camera',
    154: 'Water Conservation',
    155: 'Workout Routine',
    156: 'Apple Device',
    157: 'UI/UX Design',
    158: 'Horror Story',
    159: 'Piano Believer',
    160: 'Free Fire Event',
    161: 'Coffee Brewing',
    162: 'Web Development',
    163: 'Virtual Reality',
    164: 'Tree Grafting',
    165: 'Artistic Expression',
    166: 'Cartoon Fun',
    167: 'Weight Loss Plan',
    168: 'Stock Market Analysis',
    169: 'Sad Reality',
    170: 'Solar Power',
    171: 'Medical Motivation',
    172: 'Cybersecurity Career',
    173: 'Primate Adventure',
    174: 'Nikon Photography',
    175: 'Sewing Project',
    176: 'SEO Strategy',
    177: 'Versus Comparison',
    178: 'Mobile Photography',
    179: 'Skincare Effect',
    180: 'Python Basics',
    181: 'Dog Training',
    182: 'Glass Blowing',
    183: 'Pet Care Simulator',
    184: 'Ludo Challenge',
    185: 'Fortnite Update',
    186: 'FNAF Adventure',
    187: 'Superhero Gadgets',
    188: 'Cursive Writing',
    189: 'Thai Journey',
    190: 'Respect Moment',
    191: 'Fashion Pose',
    192: 'Digital Marketing',
    193: 'Indoor Gardening',
    194: 'Climate Change',
    195: 'True Storytelling',
    196: 'Rap Instrumental',
    197: 'VR Funny Moment',
    198: 'RC Car Stunt',
    199: 'Saree Shopping',
    200: 'Gorilla Tag Update',
    201: 'Toyota Land Review',
    202: 'Kitchen Gadget',
    203: 'Funny Moments',
    204: 'Fragrance Gift',
    205: 'Luxury Resort',
    206: 'Samsung Drama',
    207: 'Fast Weight Loss',
    208: 'Big Wave Surfing',
    209: 'Morning Meditation',
    210: 'Funny Fitness',
    211: 'Animal Rescue',
    212: 'Science Fiction',
    213: 'Small Business',
    214: 'AI Future',
    215: 'Chennai Street',
    216: 'Forensic Analysis',
    217: 'Fortnite Settings',
    218: 'Copyright Disclaimer',
    219: 'Subway Slam',
    220: 'Logo Design',
    221: 'Urdu Poetry',
    222: 'Team Building',
    223: 'Designer Blouse',
    224: 'Hollywood Movie',
    225: 'Hit Songs',
    226: 'Custom Sneaker',
    227: 'Nature Sounds',
    228: 'Photoshop Tutorial',
    229: 'Hiking Adventure',
    230: 'Famous Kolkata',
    231: 'Smoke Bomb',
    232: 'DC Motor',
    233: 'Skincare Tips',
    234: 'Google Maps',
    235: 'Satisfying Art',
    236: 'Daily Vlog',
    237: 'General Knowledge',
    238: 'Street Food',
    239: 'CapCut Edit',
    240: 'Space Exploration',
    241: 'Amazing Facts',
    242: 'GTA Gamerz',
    243: 'Football Freestyle',
    244: 'Wildlife Conservation',
    245: 'Cricket Swing',
    246: 'Tanhai Dance',
    247: 'Horse Racing',
    248: 'Marvel Superheroes',
    249: 'iPhone Pro',
    250: 'Chess Masters',
    251: 'Weird History',
    252: 'Pottery Art',
    253: 'Pokemon League',
    254: 'FPV Drone Race',
    255: 'Quantum Computing',
    256: 'WhatsApp Status',
    257: 'Gacha Life',
    258: 'Earn Money',
    259: '3D Printing',
    260: 'Wedding Film',
    261: 'Football Stars',
    262: 'Skateboarding Tricks',
    263: 'Piano Tutorial',
    264: 'Telugu Cinema',
    265: 'Origami Craft',
    266: 'IIT vs JEE',
    267: 'Cake Decoration',
    268: 'Healthy Recipe',
    269: 'Yoga Flow',
    270: 'LEGO Set',
    271: 'Content Indicator',
    272: 'Minecraft Roleplay',
    273: 'Snake Ladder',
    274: 'Magic Band',
    275: 'Nail Art',
    276: 'RC Toy',
    277: 'Miniature Photography',
    278: 'Science Project',
    279: 'Magic Trick',
    280: 'Smartphone Test',
    281: 'Creative Knot',
    282: 'Family Feud',
    283: 'UPSC Interview',
    284: 'XXXTentacion Music',
    285: 'Joe Rogan Podcast',
    286: 'Aquarium Fish',
    287: 'Shiv Sneaker',
    288: 'Personal Branding',
    289: 'PBS Documentary',
    290: 'Skincare Routine',
    291: 'Motivational Quote',
    292: 'Hanuman Chalisa',
    293: 'Home Decor',
    294: 'Earn Money App',
    295: 'Hand Embroidery',
    296: 'Yin Studio Beat',
    297: 'Fox News Update',
    298: 'Beautiful Place',
    299: 'Zodiac Sign'
}

In [4]:
def add_descriptions_to_topics(file_path, descriptions):
    """
    Attach a 'description' field to the topics stored in a topics.json file.

    The file is loaded, every topic whose 'topic_number' is a key of
    ``descriptions`` has its 'description' set (replacing any existing value),
    and the full list is then written back to the same path. Topics with no
    matching key are left as they were.

    :param file_path: Path to the topics.json file; it is rewritten in place.
    :param descriptions: Dictionary with topic_id as keys and descriptions as values.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        loaded_topics = json.load(source)

    for entry in loaded_topics:
        number = entry['topic_number']
        if number not in descriptions:
            # No label supplied for this topic; keep the entry unchanged.
            continue
        entry['description'] = descriptions[number]

    # Overwrite the original file; non-ASCII text is kept as-is, 4-space indent.
    with open(file_path, 'w', encoding='utf-8') as target:
        json.dump(loaded_topics, target, ensure_ascii=False, indent=4)

# Example usage
# Rewrites the topics.json at file_path in place, adding the labels from
# topic_descriptions; both names are defined in earlier cells.
add_descriptions_to_topics(file_path, topic_descriptions)


In [9]:
from elasticsearch import Elasticsearch, exceptions
import json

def upload_topics_to_elasticsearch(file_path, index_name='topics'):
    """
    Replace the contents of an Elasticsearch index with topic number/description pairs.

    The topics file is read and every document is built *before* anything is
    deleted, so a missing file or a topic without a 'description' fails while
    the index is still intact. Only then is the index emptied and each topic
    indexed, using its topic number as the document id.

    Connection settings are taken from the environment so that no secret is
    stored in the notebook:

    * ``ELASTICSEARCH_URL``      (default ``http://localhost:9200/``)
    * ``ELASTICSEARCH_USERNAME`` (default ``elastic``)
    * ``ELASTICSEARCH_PASSWORD`` (required)

    :param file_path: Path to the updated topics.json file.
    :param index_name: Name of the Elasticsearch index.
    :raises RuntimeError: If ``ELASTICSEARCH_PASSWORD`` is not set.
    :raises KeyError: If a topic lacks 'topic_number' or 'description'
                      (raised before the index is modified).
    """
    # Read connection details from the environment instead of hard-coding them.
    # NOTE(review): a password was previously committed in this cell; treat it
    # as exposed and rotate it.
    url = os.environ.get("ELASTICSEARCH_URL", "http://localhost:9200/")
    username = os.environ.get("ELASTICSEARCH_USERNAME", "elastic")
    password = os.environ.get("ELASTICSEARCH_PASSWORD")
    if not password:
        raise RuntimeError(
            "Set the ELASTICSEARCH_PASSWORD environment variable before uploading topics."
        )

    # Load and validate everything up front: failing here must not leave the
    # index already wiped.
    with open(file_path, 'r', encoding='utf-8') as file:
        topics_data = json.load(file)

    # Prepare documents with only topic_number and description
    documents = [
        {
            'topic_number': topic['topic_number'],
            'description': topic['description']
        }
        for topic in topics_data
    ]

    # Setup the connection to Elasticsearch
    es = Elasticsearch(
        url,
        basic_auth=(username, password)
    )

    # Clear existing data in the index
    try:
        es.delete_by_query(index=index_name, body={"query": {"match_all": {}}})
        print("Cleared existing topics in Elasticsearch index.")
    except exceptions.NotFoundError:
        # First run: the index does not exist yet, so there is nothing to clear.
        print(f"Index '{index_name}' does not exist yet; nothing to clear.")

    for topic, document in zip(topics_data, documents):
        try:
            response = es.index(index=index_name, id=document['topic_number'], document=document)
            print(f"Uploaded topic {document['topic_number']} to Elasticsearch: {response['_id']}")
        except exceptions.RequestError as e:
            print(f"Failed to upload topic {document['topic_number']}. Error: {e}")
            print("Problematic topic data:", topic)
            break  # Remove break if you want to continue attempting to upload other topics


# Example usage
# Destructive: the function clears the target index (default 'topics') before
# uploading. Uses file_path defined in the earlier example cell.
upload_topics_to_elasticsearch(file_path)


Cleared existing topics in Elasticsearch index.
Uploaded topic 83 to Elasticsearch: 83
Uploaded topic 213 to Elasticsearch: 213
Uploaded topic 238 to Elasticsearch: 238
Uploaded topic 218 to Elasticsearch: 218
Uploaded topic 40 to Elasticsearch: 40
Uploaded topic 11 to Elasticsearch: 11
Uploaded topic 133 to Elasticsearch: 133
Uploaded topic 91 to Elasticsearch: 91
Uploaded topic 225 to Elasticsearch: 225
Uploaded topic 17 to Elasticsearch: 17
Uploaded topic 77 to Elasticsearch: 77
Uploaded topic 129 to Elasticsearch: 129
Uploaded topic 51 to Elasticsearch: 51
Uploaded topic 223 to Elasticsearch: 223
Uploaded topic 297 to Elasticsearch: 297
Uploaded topic 241 to Elasticsearch: 241
Uploaded topic 39 to Elasticsearch: 39
Uploaded topic 165 to Elasticsearch: 165
Uploaded topic 203 to Elasticsearch: 203
Uploaded topic 141 to Elasticsearch: 141
Uploaded topic 70 to Elasticsearch: 70
Uploaded topic 119 to Elasticsearch: 119
Uploaded topic 45 to Elasticsearch: 45
Uploaded topic 258 to Elastic