## Install packages

In [None]:
!pip install -q git+https://github.com/huggingface/transformers -U
#!pip install accelerate
!pip install -q bitsandbytes

In [None]:
!pip install -q bitsandbytes

In [None]:
!pip install -U transformers

In [None]:
!pip install -q -U sentence-transformers

In [None]:
#!pip install faiss-cpu
!pip install -q faiss-gpu

In [None]:
!pip install -q --upgrade torch torchvision torchaudio transformers

# Import

In [None]:
import pandas as pd
import numpy as np
import os
import ast

In [None]:
import torch
import gc

import sys, random, string, re, time
from transformers import (BitsAndBytesConfig, 
                          AutoModelForCausalLM, 
                          AutoTokenizer, pipeline)
from tqdm.auto import tqdm

# Don't Show Warning Messages
import warnings
warnings.filterwarnings('ignore')

print(f"CUDA Version: {torch.version.cuda}")
print(f"Pytorch {torch.__version__}")

In [None]:
# Seed every random number generator used in this notebook

import torch, random

seed_val = 1023

# Python, NumPy and PyTorch (CPU and all CUDA devices) share the same seed
torch.cuda.manual_seed_all(seed_val)
torch.manual_seed(seed_val)
np.random.seed(seed_val)
random.seed(seed_val)

# Request deterministic cuDNN kernels and switch off cuDNN benchmarking
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Define variables

In [None]:
import os

import huggingface_hub

# Read the Hugging Face access token from the HF_TOKEN environment variable
# instead of hard-coding a secret in the notebook.
# When the variable is not set, hf_token is None and login() asks for the
# token interactively.
hf_token = os.environ.get("HF_TOKEN")
huggingface_hub.login(token=hf_token, add_to_git_credential=False)

In [None]:
# Hugging Face Hub id of the instruction-tuned Gemma 7B checkpoint.
# The value must name a causal (decoder) language model: it is loaded with
# AutoModelForCausalLM and prompted with Gemma's
# <start_of_turn>/<end_of_turn> chat markers.
MODEL_PATH = "google/gemma-7b-it"

In [None]:
# Kaggle-hosted copy of the Gemma weights (kept for reference, not active)
#MODEL_PATH = "/kaggle/input/gemma/transformers/7b-it/1"

# CSV with the data used to build the few shot prompt
FEW_SHOT_DATA_PATH = '../input/gemma-comp-data/df_corrected_data.csv'

# Directory with the text files containing info about Kaggle
KAGGLE_DATA_PATH = '../input/gemma-comp-data/rev4-cleaned-txt-kaggle/'

# Retrieval settings:
#   TOP_K                 - vector search results that will be reranked
#   NUM_CHUNKS_IN_CONTEXT - text chunks that will be passed to Gemma
TOP_K, NUM_CHUNKS_IN_CONTEXT = 20, 3


# Define the device

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report the selected device and the library versions, one per line
print(
    f"Device: {DEVICE}",
    f"CUDA Version: {torch.version.cuda}",
    f"Pytorch {torch.__version__}",
    sep="\n",
)

In [None]:
# When CUDA is available, print the CPU count, the number of GPUs
# and the name of GPU 0

if torch.cuda.is_available():
    print(f'Num CPUs: {os.cpu_count()}')
    print(f'Num GPUs: {torch.cuda.device_count()}')
    print(f'GPU Type: {torch.cuda.get_device_name(0)}')

# Helper functions

In [None]:
def run_faiss_search(query_text, top_k):

    """
    Return the FAISS index values of the top_k nearest neighbours of a query.

    The query string is embedded with the module-level `model`
    (its `encode` method is called on a one-element list) and the
    embedding is passed to `faiss_index.search`. Only the index values
    for this single query are returned; the scores are discarded.

    Parameters:
    - query_text (str): The text of the query to search for.
    - top_k (int): The number of nearest neighbours to request.

    Returns:
    - The first row of the index array returned by
    `faiss_index.search`, i.e. the top_k index values for this query.

    Note:
    - `model` and `faiss_index` must already be defined outside
    the function.
    """

    # Embed the query as a batch of one
    query_embedding = model.encode([query_text], show_progress_bar=False)

    # search() returns (scores, index values); only the index values are used
    _, neighbour_ids = faiss_index.search(query_embedding, top_k)

    # A single query was sent, so the result holds a single row
    return neighbour_ids[0]
    

def run_rerank(index_vals_list, query_text):

    """
    Re-rank retrieved passages against the query with a cross-encoder.

    The index values are mapped to their text in
    `df_data['prepared_text']`, each text is paired with the query,
    and `cross_encoder.predict` scores every pair. The passages are
    then sorted from the highest score to the lowest.

    Parameters:
    - index_vals_list (list of int): Index values of the retrieved
    passages (positions in `df_data['prepared_text']`).
    - query_text (str): The query used to score the passages.

    Returns:
    - pred_list (list): One entry per passage, best score first.
    Each entry is a one-element set containing the passage text.

    Note:
    - `df_data` and `cross_encoder` must already be defined
    outside the function.
    """

    # Look up the text chunk behind every retrieved index value
    all_chunks = list(df_data['prepared_text'])
    candidates = [all_chunks[idx] for idx in index_vals_list]

    # The cross encoder takes a list of [question, text_chunk] pairs
    pairs = [[query_text, chunk] for chunk in candidates]

    # Table with one row per pair, plus the original df_data index value
    ranking = pd.DataFrame(pairs, columns=['query_text', 'pred_text'])
    ranking['original_index'] = index_vals_list

    # Score every pair and order the rows from best to worst
    ranking['cross_scores'] = cross_encoder.predict(pairs, show_progress_bar=False)
    ranking = ranking.sort_values(by='cross_scores', ascending=False)
    ranking = ranking.reset_index(drop=True)

    # Wrap each passage in curly braces (a one-element set)
    return [{passage} for passage in ranking['pred_text']]

    
   
def vector_search_and_rerank(query_text, top_k=10):

    """
    Retrieve passages for a query with FAISS and re-rank them.

    This is the retrieval step of the RAG system: `run_faiss_search`
    returns the index values of the top_k nearest passages, and
    `run_rerank` orders those passages with the cross-encoder.

    Parameters:
    - query_text (str): The text of the query.
    - top_k (int, optional): The number of passages to retrieve
    and re-rank. Defaults to 10.

    Returns:
    - pred_list (list): The value returned by `run_rerank` for the
    retrieved passages and the query.

    Note:
    - `run_faiss_search` and `run_rerank` must already be defined.
    """

    # Vector search first, then rerank what it found
    return run_rerank(run_faiss_search(query_text, top_k), query_text)

 

def extract_gemma_response(response):

    """
    Pull the model's answer out of a decoded Gemma generation.

    Parameters:
    - response (str): The decoded text, which may contain the prompt
    followed by the model turn.

    Returns:
    - (str) The text after the last '<start_of_turn>model' marker,
    stripped and with '<end_of_turn>' removed. When that text contains
    "I cannot answer this question", a standard apology is returned
    instead.
    """

    # Keep only what follows the last model-turn marker
    # (the whole text when the marker is absent)
    _, _, answer = response.rpartition('<start_of_turn>model')

    # Trim surrounding whitespace, then drop the end-of-turn token
    answer = answer.strip().replace('<end_of_turn>', "")

    # Standard reply when Gemma says the answer is not available
    if 'I cannot answer this question' in answer:
        return "Sorry, that information is not available."

    return answer


def format_text(text):

    """
    Remove Markdown '*' symbols and tidy each line of a text.

    Parameters:
    - text (str): The text to clean.

    Returns:
    - (str) Every line of `text` with '*' removed and surrounding
    whitespace stripped, each followed by a newline.
    """

    # Clean the lines one by one
    cleaned_lines = [line.replace('*', '').strip() for line in text.split('\n')]

    # Every line, including the last one, ends with a newline
    return ''.join(line + '\n' for line in cleaned_lines)


def gemma_assistant(question, max_new_tokens=768):

    """
    Ask Gemma a question and return its answer as plain text.

    The question is wrapped in a Gemma chat prompt that also tells the
    model not to use Markdown. The prompt is tokenized with the
    module-level `tokenizer`, moved to `DEVICE`, and passed to
    `gemma_model.generate`. The decoded output is cut down to the text
    after the last '<start_of_turn>model' marker and cleaned with
    `format_text`.

    Parameters:
    - question (str): The question to ask.
    - max_new_tokens (int, optional): Upper limit on the number of
    generated tokens. Defaults to 768.

    Returns:
    - (str) The cleaned answer.

    Note:
    - `tokenizer`, `gemma_model`, `DEVICE` and `format_text` must
    already be defined outside the function.
    """

    # Create the prompt
    prompt = f"""<start_of_turn>user 
    Don't use Markdown to format your response.
    {question}<end_of_turn>
    <start_of_turn>model
    """

    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    # Generate the outputs from prompt
    generate_ids = gemma_model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Decode the generated output
    generated_text = tokenizer.batch_decode(generate_ids,
                                        skip_special_tokens=True,
                                        clean_up_tokenization_spaces=False)[0]

    # Extract the answer: keep the text after the last model-turn marker
    response = generated_text.split('<start_of_turn>model')[-1]
    # Remove leading and trailing spaces
    response = response.strip()
    # Remove the '<end_of_turn> token
    response = response.replace('<end_of_turn>', "")

    # Remove markdown '*' symbols
    return format_text(response)


def timer(start_time):

    """
    Return the time elapsed since `start_time`.

    Parameters:
    - start_time (float): A timestamp taken earlier with time.time().

    Returns:
    - (float) Seconds between `start_time` and now, rounded to
    one decimal place.
    """

    return round(time.time() - start_time, 1)

# Initialize gemma-7b-it


In [None]:
# Load the quantized model and its tokenizer.
# (This step takes about 2 minutes)


# Computations on the quantized weights run in 16-bit floating point
compute_dtype = torch.float16


# bitsandbytes settings: 4-bit weights in the NF4 format,
# without double quantization
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

# Load the causal language model with that quantization config.
# device_map="auto" leaves the placement of the weights to the library.
gemma_model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=bnb_config,
    device_map="auto",
)

# Switch off the key/value cache flag on the model config.
# NOTE(review): use_cache=False is commonly a fine-tuning setting and may
# slow down generation - confirm it is wanted for inference.
gemma_model.config.use_cache = False

# Set pretraining_tp to 1 on the model config
gemma_model.config.pretraining_tp = 1

# Tokenizer that matches the model
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)


# Ask Gemma questions about Kaggle

In [None]:
question = 'What is prompt seeking topic or areas'

# Gemma chat prompt: one user turn, then the opening of the model turn
prompt = f"<start_of_turn>user\n{question}<end_of_turn>\n<start_of_turn>model\n"

# Start timing
start_time = time.time()

# Tokenize the prompt and move the tensors to the selected device
inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
# Generate up to 768 new tokens
generate_ids = gemma_model.generate(**inputs, max_new_tokens=768)
# Decode the first (and only) generated sequence
generated_text = tokenizer.batch_decode(
    generate_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]


# Extract the answer: text after the last model-turn marker, stripped,
# with the '<end_of_turn>' token removed
response = generated_text.split('<start_of_turn>model')[-1]
response = response.strip().replace('<end_of_turn>', "")

# Remove markdown '*' symbols.
# The default Markdown that Gemma outputs doesn't always display well.
response = format_text(response)


# Get the inference time
elapsed_time = timer(start_time)

# Report the timing, the question and the answer, separated by blank lines
print(f"Time taken: {elapsed_time} seconds\n")
print(f"User:\n {question}\n")
print(f"Gemma:\n {response}")


<hr>
This answer looks quite good. Let's put the above code into a function called gemma_assistant() and ask Gemma a few more questions.

## Read the prompt dataset (CSV)

In [None]:
# Load the CSV of translated first prompts into a dataframe
df = pd.read_csv('/kaggle/input/translated-merged-first-prompt/translated_merged_first_prompt_ans_code.csv')

# Preview the first three rows
df.head(3)

In [None]:
# Instruction passed to the model as the question.
# NOTE(review): `ques` is assigned again in a later cell; whichever
# assignment ran last is the one the main loop uses.
ques = "Just name 3/4 software engineering principles where the user needs assistance. NO EXPLANATION"

In [None]:
# Column of prompts that will be used as context.
# This rebinds `prompt`, which an earlier cell used for a prompt string.
prompt = df['Translated_First Prompt']

In [None]:
# Re-imports tqdm from the base package (an earlier cell imported tqdm.auto)
from tqdm import tqdm
# Enable tqdm's pandas integration
tqdm.pandas()

In [None]:
def hello_world(question, context, max_new_tokens=768):

    """
    Ask Gemma a question about a given context and return its answer.

    The context and the question are placed in a Gemma chat prompt,
    which is tokenized with the module-level `tokenizer`, moved to
    `DEVICE` and passed to `gemma_model.generate`. The decoded output
    is cut down to the text after the last '<start_of_turn>model'
    marker. The intermediate objects are released before returning.

    Parameters:
    - question (str): The instruction or question for the model.
    - context: The text the question refers to; it is formatted
    into the prompt.
    - max_new_tokens (int, optional): Upper limit on the number of
    generated tokens. Defaults to 768.

    Returns:
    - gemma_response (str): The answer, stripped and with
    '<end_of_turn>' removed.

    Note:
    - `tokenizer`, `gemma_model` and `DEVICE` must already be
    defined outside the function.
    """

    # Create the prompt
    prompt = f"""<start_of_turn>user
    Context: {context}
    Question: {question}<end_of_turn>
    <start_of_turn>model
    """

    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    # Generate the outputs from prompt
    generate_ids = gemma_model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Decode the generated output
    response = tokenizer.batch_decode(generate_ids, skip_special_tokens=True,
                                      clean_up_tokenization_spaces=False)[0]

    # Extract the answer: keep the text after the last model-turn marker
    gemma_response = response.split('<start_of_turn>model')[-1]
    # Remove leading and trailing spaces
    gemma_response = gemma_response.strip()
    # Remove the '<end_of_turn> token
    gemma_response = gemma_response.replace('<end_of_turn>', "")

    # Clear the memory to create space.
    # Drop the references and collect garbage first, so that the tensors
    # are actually freed before the CUDA cache is emptied.
    del prompt, inputs, generate_ids
    gc.collect()
    torch.cuda.empty_cache()

    return gemma_response

# Cleaning

In [None]:
def clean_text(text):

    """
    Remove command mentions and boilerplate from a prompt text.

    Parameters:
    - text (str): The text to clean.

    Returns:
    - (str) The text without backtick-quoted `npx ...` and
    `npm start ...` mentions, without the line break that directly
    follows '...', without the 'For self-development from the git
    repo: ' prefix, and stripped of surrounding whitespace.
    """

    # Patterns whose matches are deleted, applied in this order
    removals = (
        r'`npx [\w-]+`',           # mentions of npx
        r'`npm start [\w\s\-]+`',  # mentions of npm start
        r'(?<=\.\.\.)\n',          # line break right after '...'
    )
    for pattern in removals:
        text = re.sub(pattern, '', text)

    # Drop the fixed introductory phrase, then trim
    return text.replace('For self-development from the git repo: ', '').strip()

In [None]:
# Pick one prompt (label 453) as a sample context
context = prompt[453]

In [None]:
# Hand-written sample context: a bank debit notification.
# NOTE(review): context1 is not used by any other visible cell.
context1 = "Your account 0002*****8534 has been debited from HEAD OFFICE for BDT 15,000.00 on 23-MAR-2025 10:24 AM. A/C Bal BDT 1007"

In [None]:
# Clean the sample context with clean_text()
context = clean_text(context)

In [None]:
# Display the cleaned sample context
context 

In [None]:
# Number of prompts in the dataset
len(prompt)

In [None]:
# Instruction asking the model for the debit/credit amount and the balance
# in a fixed two-line template
ques = """Just give me numbers related to currency and also mention debit/credit and current balance. No extra lines.
Follow this template:
Credit/Debit: X
Balance: Y"""

# Try the instruction on the first prompt
ans = hello_world(ques, prompt[0])
#print(ques)
try:
    # Keep the text after the first '. ' of every line that contains one.
    # NOTE(review): this targets numbered lines such as "1. item", while the
    # template above asks for "Key: value" lines - confirm the parsing
    # still matches the expected answer format.
    res = [line.split('. ', 1)[1] for line in ans.split('\n') if '. ' in line]
    print(res)
except Exception as e:
    print("Error processing response:", e)

In [None]:
# Set pandas display options to show full dataframe
pd.set_option('display.max_rows', None)      # Show all rows
pd.set_option('display.max_columns', None)   # Show all columns
pd.set_option('display.width', None)         # Auto-adjust width
pd.set_option('display.max_colwidth', None)

# Generate Topics (Main Logic)

In [None]:
from tqdm import tqdm
import pandas as pd

# Create an empty list to store results
results = []

# Visit every prompt by position. The bound is taken from the data instead
# of a hard-coded row count, so the loop stays correct when the dataset
# changes size or its index is not 0..n-1.
for i in tqdm(range(len(prompt)), desc="Processing"):
    ans = hello_world(ques, prompt.iloc[i])
    try:
        # Keep the text after the first '. ' of every line that contains one
        # (e.g. the item of a numbered line "1. item")
        res = [line.split('. ', 1)[1] for line in ans.split('\n') if '. ' in line]
        results.append({"Answer": res})
    except Exception as e:
        # Best effort: report the problem and continue with the next prompt.
        # No row is appended for this prompt.
        print("Error processing response:", e)

# Convert the list to a DataFrame
df_with_context = pd.DataFrame(results)
df_with_context

In [None]:
# Save the answers to results.csv without the index column
df_with_context.to_csv("results.csv", index=False) 

# All Topics Analysis

In [None]:
import pandas as pd
# Load the previously generated topics from CSV
topics = pd.read_csv('/kaggle/input/prompt-all-topics/all_topics.csv')

# Preview the first 20 rows
topics.head(20)

In [None]:
# Check the Python type of the 'Answer' column object
type(topics['Answer'])

In [None]:
# Join all 'Answer' values into a single comma-separated string.
# explode() only expands list-like cells into separate rows.
# NOTE(review): after read_csv the cells are presumably strings such as
# "['a', 'b']", which explode() leaves unchanged - confirm. The brackets and
# quotes are stripped by the regex in a later cell.
result = ', '.join(topics['Answer'].explode())

In [None]:
# Show the joined string
print(result)

In [None]:
import re

# Strip the list punctuation from the joined string:
# square brackets, single quotes and asterisks
cleaned_text = re.compile(r"[\[\]'\*]").sub("", result)

print(cleaned_text)

In [None]:
# Split on commas, trim every piece and keep the non-empty ones
list_items = [piece for piece in map(str.strip, cleaned_text.split(',')) if piece]

In [None]:
# Total number of items (duplicates included)
len(list_items)

In [None]:
# Display the items
list_items

In [None]:
from collections import Counter
# Count frequency of each item
frequency = Counter(list_items)
# Display the counts
frequency

In [None]:
# Check the type of the frequency object
type(frequency)

In [None]:
# Number of distinct items
len(frequency)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
import scipy.cluster.hierarchy as sch

# Get the first 20 items based on frequency
top_20 = dict(frequency.most_common(20))

# Plot the frequency
plt.figure(figsize=(10, 6))
plt.bar(top_20.keys(), top_20.values(), color='skyblue')

# Customize the plot.
# The title is derived from the data so that it always states the number
# of items that are actually plotted.
plt.title(f'Top {len(top_20)} Most Frequent Items', fontsize=16)
plt.xlabel('Items', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# The 40 most frequent items and their counts, most frequent first
top_40 = dict(frequency.most_common(40))

In [None]:
# Display the 40 most frequent items
top_40

# Merging

In [None]:
from transformers import pipeline
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

In [None]:
# Candidate labels for classification (representing key concepts)
candidate_labels = [
    "Modularization", "Single Responsibility Principle (SRP)", "Design Patterns",
    "Abstraction", "Encapsulation", "DRY Principle", "State Management",
    "Agile Development", "Testing", "Software Engineering Principles", "Data Structures"
]

# Add any missing labels from categories to candidate_labels.
# Every label needs its own trailing comma: two adjacent string literals
# without a comma are silently concatenated into a single label.
new_labels = [
    "Open-source Software Development Principles", "Algorithms",
    "Community-driven Learning and Knowledge Sharing", "Loose Coupling",
    "Async/Await vs Promises", "Polymorphism", "Event Handling", "Separation of Concerns",
    "Dependency Injection",
]

# Update candidate_labels with new labels if not already present
for label in new_labels:
    if label not in candidate_labels:
        candidate_labels.append(label)

# Load a specific model for zero-shot classification and use GPU (device=0)
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli', device=0)

# Dictionary to store merged frequencies
merged_categories = defaultdict(int)

# Classify each category and merge frequencies based on the most relevant label
for category, freq in tqdm(top_40.items(), desc="Processing Categories", unit="category"):
    result = classifier(category, candidate_labels)
    # The first entry of 'labels' is taken as the label with the highest score
    best_label = result['labels'][0]
    merged_categories[best_label] += freq

# Sort merged categories by frequency (high to low)
merged_categories = dict(sorted(merged_categories.items(), key=lambda item: item[1], reverse=True))

# Display the sorted merged categories
merged_categories

In [None]:
# Check the type of the merged result
type(merged_categories)

In [None]:
import plotly.express as px

# Category names (as strings) and their merged frequencies
items = list(map(str, merged_categories))
frequencies = list(merged_categories.values())

# Polar bar chart: one bar per category, bar length and colour = frequency
fig = px.bar_polar(
    r=frequencies,
    theta=items,
    color=frequencies,
    hover_name=items,
    hover_data={'Frequency': frequencies},
    color_continuous_scale='Viridis',
)

# Title, visible radial axis and no legend
fig.update_layout(
    title='Items by Frequency',
    showlegend=False,
    polar=dict(radialaxis=dict(visible=True, title='Frequency')),
)

# Display the figure
fig.show()


In [None]:
import plotly.express as px
import pandas as pd
from collections import Counter
import plotly.io as pio

# Render figures through an iframe (for the Kaggle notebook)
pio.renderers.default = "iframe"

# Category names and frequencies taken from merged_categories
labels = list(merged_categories)
values = list(merged_categories.values())

# One row per category
df = pd.DataFrame({'Item': labels, 'Frequency': values})

# Sunburst chart with a single ring of categories; no title is set
fig = px.sunburst(
    df,
    path=['Item'],
    values='Frequency',
    color='Frequency',
    color_continuous_scale='Blues',
)

# Figure size, font sizes and margins
fig.update_layout(
    width=1000,
    height=800,
    autosize=True,
    font={'size': 14},
    title_font={'size': 24},
    xaxis_title_font={'size': 18},
    yaxis_title_font={'size': 18},
    margin={'l': 20, 'r': 20, 't': 40, 'b': 40},
)

# Save the plot as PNG. This requires Kaleido, which has to be installed
# before this cell is run.
fig.write_image("sunburst_items_with_highest_frequency.png")

# Show interactive plot
fig.show()


In [None]:
pip install -U kaleido

In [None]:
import plotly.express as px
import pandas as pd
from collections import Counter
import plotly.io as pio

# Render figures through an iframe (for the Kaggle notebook)
pio.renderers.default = "iframe"

# Category names and frequencies taken from merged_categories
labels = list(merged_categories)
values = list(merged_categories.values())

# One row per category
df = pd.DataFrame({'Item': labels, 'Frequency': values})

# Horizontal bar chart with the value printed on each bar; no title is set
fig = px.bar(
    df,
    x='Frequency',
    y='Item',
    orientation='h',
    text_auto=True,
    color='Frequency',
    color_continuous_scale='Viridis',
)

# Axis order and titles, theme, font sizes, margins and the final size
# (1000 x 600, chosen for research papers)
fig.update_layout(
    template="plotly_white",
    xaxis_title="Frequency",
    yaxis_title="",
    yaxis=dict(categoryorder='total ascending'),
    font={'size': 14},
    title_font={'size': 24},
    xaxis_title_font={'size': 18},
    yaxis_title_font={'size': 18},
    margin={'l': 20, 'r': 20, 't': 40, 'b': 40},
    autosize=True,
    width=1000,
    height=600,
)

# Save the plot as PDF
fig.write_image("items_with_highest_frequency.pdf")

# Show interactive plot
fig.show()


In [None]:
import plotly.express as px
import pandas as pd
from collections import Counter
import plotly.io as pio

# Render figures through an iframe (for the Kaggle notebook)
pio.renderers.default = "iframe"

# Category names and frequencies taken from merged_categories
labels = list(merged_categories)
values = list(merged_categories.values())

# One row per category
df = pd.DataFrame({'Item': labels, 'Frequency': values})

# Bubble chart: bubble size and colour both follow the frequency;
# no title is set
fig = px.scatter(
    df,
    x='Frequency',
    y='Item',
    hover_name='Item',
    color='Frequency',
    color_continuous_scale='Viridis',
    size='Frequency',
    size_max=30,
)

# Axis order and titles, theme, fonts and margins
fig.update_layout(
    template="plotly_white",
    xaxis_title="Frequency",
    yaxis_title="",
    yaxis=dict(categoryorder='total ascending'),
    font={'family': "Arial", 'size': 16},
    title_font={'size': 20},
    margin={'l': 40, 'r': 40, 't': 50, 'b': 50},
    autosize=True,
    showlegend=False,
)

# Save the figure as a PDF file
fig.write_image("bubble_chart.pdf", width=1000, height=800)

# Show interactive plot
fig.show()


In [None]:
import plotly.graph_objects as go
import pandas as pd
from collections import Counter
import plotly.io as pio

# Render figures through an iframe (for the Kaggle notebook)
pio.renderers.default = "iframe"

# Category names and frequencies taken from merged_categories
labels = list(merged_categories)
values = list(merged_categories.values())

# One row per category
df = pd.DataFrame({'Item': labels, 'Frequency': values})

fig = go.Figure()

# Bubbles: one marker per category, sized and coloured by frequency
fig.add_trace(go.Scatter(
    x=df['Frequency'],
    y=df['Item'],
    mode='markers',
    name='Items',
    text=df['Item'],  # Hover text
    marker={
        'size': df['Frequency'],
        'color': df['Frequency'],
        'colorscale': 'Viridis',
        'showscale': True,
    },
))

# One thin gray line trace between every pair of neighbouring rows
for i in range(1, len(df)):
    fig.add_trace(go.Scatter(
        x=df.loc[[i - 1, i], 'Frequency'].tolist(),
        y=df.loc[[i - 1, i], 'Item'].tolist(),
        mode='lines',
        name='Lines',
        line={'color': 'gray', 'width': 1},
    ))

# Axis titles and ranges, theme, font sizes, margins and figure size;
# no title is set
fig.update_layout(
    template="plotly_white",  # Light theme
    xaxis_title="Frequency",
    yaxis_title="",
    xaxis={'range': [0, max(df['Frequency']) + 10]},  # x-axis starts at 0
    yaxis={'tickmode': 'array'},
    font={'size': 14},
    title_font={'size': 24},
    xaxis_title_font={'size': 18},
    yaxis_title_font={'size': 18},
    margin={'l': 20, 'r': 20, 't': 40, 'b': 40},
    showlegend=False,
    autosize=True,
    width=1000,
    height=600,
)

# Save the plot as PNG (Requires Kaleido)
fig.write_image("scatter_items_with_highest_frequency.png")

# Show interactive plot
fig.show()


In [None]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import plotly.io as pio

# Render figures through an iframe (for the Kaggle notebook)
pio.renderers.default = "iframe"

# Category names and frequencies taken from merged_categories
labels = list(merged_categories)
values = list(merged_categories.values())

# Number of nodes (items)
num_nodes = len(labels)

# Spread the nodes evenly around the unit circle
angles = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
x_pos, y_pos = np.cos(angles), np.sin(angles)

# One row per node: label, frequency and position
df = pd.DataFrame({'Item': labels, 'Frequency': values, 'X': x_pos, 'Y': y_pos})

# Nodes: labelled markers, sized by frequency / 10 and coloured by frequency
node_trace = go.Scatter(
    x=df['X'],
    y=df['Y'],
    mode='markers+text',
    text=df['Item'],
    textposition='top center',
    marker={
        'size': df['Frequency'] / 10,
        'color': df['Frequency'],
        'colorscale': 'Viridis',
        'showscale': True,
    },
)

# Edges: one thin gray line between every pair of neighbouring nodes
# (the last node is not connected back to the first)
edge_traces = []
for i in range(1, num_nodes):
    edge_traces.append(go.Scatter(
        x=df.loc[[i - 1, i], 'X'].tolist(),
        y=df.loc[[i - 1, i], 'Y'].tolist(),
        mode='lines',
        name='Lines',
        line={'color': 'gray', 'width': 1},
    ))

# Edges are drawn first, the nodes on top of them
fig = go.Figure(data=edge_traces + [node_trace])

# Hidden axes, light theme, larger fonts; no title is set
hidden_axis = {'showgrid': False, 'zeroline': False, 'visible': False}
fig.update_layout(
    template="plotly_white",
    xaxis=dict(hidden_axis),
    yaxis=dict(hidden_axis),
    font={'size': 18},
    title_font={'size': 24},
    margin={'l': 40, 'r': 40, 't': 50, 'b': 50},
    showlegend=False,
    autosize=True,
)

# Show interactive plot
fig.show()

# Save the figure as PDF, resized for a research paper
fig.write_image("circular_graph.pdf", width=1200, height=800)


In [None]:
import plotly.graph_objects as go
import pandas as pd
import plotly.io as pio

# Render figures through an iframe (for the Kaggle notebook)
pio.renderers.default = "iframe"

# Category names and frequencies taken from merged_categories
labels = list(merged_categories)
values = list(merged_categories.values())

# One row per category
df = pd.DataFrame({'Item': labels, 'Frequency': values})

# Spider (radar) chart: frequency as radial distance,
# category name as angular position, area filled
fig = go.Figure()
fig.add_trace(go.Scatterpolar(
    r=df['Frequency'],
    theta=df['Item'],
    name="Categories Frequency",
    fill='toself',
    line={'color': 'blue'},
    marker={'size': 8},
))

# Radial range, one angular tick per category, fonts, theme and margins;
# no title is set
fig.update_layout(
    template="seaborn",
    polar={
        'radialaxis': {
            'visible': True,
            'range': [0, max(df['Frequency']) + 10],
        },
        'angularaxis': {
            'tickmode': 'array',
            'tickvals': list(range(len(df['Item']))),
            'ticktext': df['Item'],
        },
    },
    font={'size': 14},
    title_font={'size': 24},
    margin={'l': 40, 'r': 40, 't': 50, 'b': 50},
    showlegend=False,
    autosize=True,
)

# Show interactive plot
fig.show()

# Save the figure as PNG, resized for a research paper
fig.write_image("spider_chart.png", width=1200, height=800)


In [None]:
import plotly.graph_objects as go
import pandas as pd
import plotly.io as pio

# "Stock"-style chart of the merged category frequencies: a line with
# markers overlaid with a candlestick trace.
# The 'iframe' renderer is selected so the figure displays in a Kaggle notebook.
pio.renderers.default = "iframe"

# Use 'merged_categories' as the data source:
# category names become the x values, their counts the y values.
labels = list(merged_categories.keys())
values = list(merged_categories.values())

# Create DataFrame (one row per category)
df = pd.DataFrame({'Item': labels, 'Frequency': values})

fig = go.Figure()

# Line chart of the counts, with a red circular marker on every category
fig.add_trace(go.Scatter(
    x=df['Item'],  # Categories as the x-axis
    y=df['Frequency'],  # Frequencies as the y-axis
    mode='lines+markers',
    name="Categories Frequency",
    line=dict(color='blue', width=2),
    marker=dict(size=6, color='red', symbol='circle')
))

# Illustrative only: open and close both equal the count, and high/low are
# the count +/- 10. The band is synthetic and is NOT derived from the data.
fig.add_trace(go.Candlestick(
    x=df['Item'],
    open=df['Frequency'],
    high=df['Frequency'] + 10,
    low=df['Frequency'] - 10,
    close=df['Frequency'],
    increasing_line_color='green',
    decreasing_line_color='red',
    name="Category Frequency Range"
))

# Layout: larger fonts, white theme, rotated category labels.
fig.update_layout(
    title_font=dict(size=24),  # only matters if a title is set; none is set here
    font=dict(size=14),
    template="plotly_white",
    showlegend=True,
    xaxis=dict(
        title=dict(text=""),  # no x-axis title
        # 'array' is the tick mode under which tickvals/ticktext take effect;
        # the previous 'linear' mode left both settings ignored.
        tickmode='array',
        tickvals=list(range(len(labels))),  # one tick per category position
        ticktext=labels,
        tickangle=45  # Rotate the x-axis labels for better visibility
    ),
    # Single definition of the y-axis title (text and font together)
    yaxis=dict(
        title=dict(text="Frequency", font=dict(size=16)),
    ),
    autosize=True,
    margin=dict(l=40, r=40, t=50, b=100)  # extra bottom room for rotated labels
)

# Show interactive plot
fig.show()

# Save the figure as a 1200x800 PNG (sized for the research paper)
fig.write_image("stock_chart.png", width=1200, height=800)


In [None]:
import plotly.graph_objects as go
import pandas as pd
import plotly.io as pio

# Line graph of the merged category frequencies with the area under the
# line filled.
# The 'iframe' renderer is selected so the figure displays in a Kaggle notebook.
pio.renderers.default = "iframe"

# Use 'merged_categories' as the data source:
# category names become the x values, their counts the y values.
labels = list(merged_categories.keys())
values = list(merged_categories.values())

# Create DataFrame (one row per category)
df = pd.DataFrame({'Item': labels, 'Frequency': values})

fig = go.Figure()

# Single line trace; 'tozeroy' fills from the line down to y = 0
fig.add_trace(go.Scatter(
    x=df['Item'],  # Categories as the x-axis
    y=df['Frequency'],  # Frequencies as the y-axis
    mode='lines',
    fill='tozeroy',
    fillcolor='rgba(0, 123, 255, 0.3)',  # semi-transparent blue fill
    line=dict(color='blue', width=2),
    name="Category Frequency"
))

# Layout: white template, titled axes, one labelled tick per category
fig.update_layout(
    title="Line Graph with Filled Area Underneath",
    xaxis_title="Category",
    yaxis_title="Frequency",
    template="plotly_white",
    showlegend=False,  # Hide legend since it's only one line
    xaxis=dict(
        # 'array' is the tick mode under which tickvals/ticktext take effect;
        # the previous 'linear' mode left both settings ignored.
        tickmode='array',
        tickvals=list(range(len(labels))),  # one tick per category position
        ticktext=labels,
    ),
    plot_bgcolor='rgba(255, 255, 255, 1)',  # White background for the plot area
)

# Show interactive plot
fig.show()


In [None]:
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
import numpy as np

# Dendrogram of the merged categories, clustered on their frequency values.

# Category names (leaf labels) and their counts, as parallel lists
categories = [*merged_categories.keys()]
values = [*merged_categories.values()]

# linkage() takes a 2-D observation matrix: one row per category,
# with the count as its only feature column.
values_reshaped = np.reshape(np.array(values), (-1, 1))

# Ward-linkage hierarchical clustering
linked = sch.linkage(values_reshaped, 'ward')

# Colours reused across the figure
title_ink = '#2b2d42'   # dark blue-grey: title and axis labels
axis_ink = '#1a1a1a'    # near-black: spines, ticks, grid
link_palette = ('#ef476f', '#ffd166', '#06d6a0', '#118ab2')


def pick_link_color(k):
    """Return the palette colour for link id *k*, cycling through the palette."""
    return link_palette[k % 4]


# 12x8 inch figure with a light grey canvas and a white plotting area
ax = plt.figure(figsize=(12, 8), facecolor='#f5f5f5').gca()
ax.set_facecolor('#ffffff')

# Draw the dendrogram on the current axes, growing to the right so the
# category labels sit on the y-axis.
# NOTE(review): link_color_func appears to take precedence over
# color_threshold / above_threshold_color for link colours — confirm
# against the SciPy documentation before relying on the latter.
dendrogram = sch.dendrogram(
    linked,
    labels=categories,
    orientation='right',
    distance_sort='descending',
    color_threshold=0,
    above_threshold_color=title_ink,
    leaf_rotation=0,
    leaf_font_size=10,
    link_color_func=pick_link_color,
)

# Title
ax.set_title(
    "Hierarchical Clustering of Merged Categories",
    fontsize=18,
    fontweight='bold',
    pad=20,
    color=title_ink,
)

# Axis labels: with the 'right' orientation, distance is on x and the
# categories are on y.
label_style = {'fontsize': 14, 'fontstyle': 'italic', 'color': title_ink}
ax.set_xlabel("Euclidean Distance", **label_style)
ax.set_ylabel("Categories", **label_style)

# Hide the top/right spines and darken the remaining two
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
for side in ('left', 'bottom'):
    ax.spines[side].set_color(axis_ink)

# Dark y ticks; labelsize=12 is applied after the dendrogram call, so it is
# the size that ends up on the y tick labels (leaf_font_size was 10).
ax.tick_params(axis='y', colors=axis_ink, labelsize=12)

# Faint dashed grid lines at the y ticks
ax.grid(True, axis='y', linestyle='--', alpha=0.3, color=axis_ink)

# Extra padding so long category labels are not clipped
plt.tight_layout(pad=3.0)

plt.show()