# In [None]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to preprocess data
def preprocess_data(df):
    """Blank out missing values and drop repeated rows.

    Every NaN cell is replaced with an empty string, then rows that are
    exact duplicates of an earlier row are removed. The input frame is not
    modified; a new DataFrame is returned. Surviving rows keep their
    original index labels (the index is not reset).
    """
    without_gaps = df.fillna(value="")
    deduplicated = without_gaps.drop_duplicates(keep="first")
    return deduplicated

# Read the bird dataset from disk and clean it in one step
bird_df = preprocess_data(pd.read_csv("indian_bird_dataset_5000_updated.csv"))

# Build the text used for similarity comparison: the descriptive columns,
# joined left to right with a single space between consecutive values
feature_columns = ["City", "Season", "Most Active Time", "Region", "Primary Habitat"]
combined_text = bird_df[feature_columns[0]]
for column_name in feature_columns[1:]:
    combined_text = combined_text + " " + bird_df[column_name]
bird_df["combined_features"] = combined_text

# Fit the TF-IDF vectorizer on the combined text (cast to unicode first)
vectorizer = TfidfVectorizer()
corpus = bird_df["combined_features"].values.astype("U")
tfidf_matrix = vectorizer.fit_transform(corpus)

# Persist the cleaned data, the fitted vectorizer and the TF-IDF matrix
# together as one tuple so they can be reloaded as a unit
with open("bird_model.pkl", "wb") as pickle_out:
    pickle.dump((bird_df, vectorizer, tfidf_matrix), pickle_out)

# Function to get top matching birds
def get_top_birds(city, season, most_active_time, top_n=5):
    """Return an HTML snippet listing the birds most similar to a query.

    The three query strings are joined with spaces, transformed with the
    TF-IDF vectorizer stored in ``bird_model.pkl`` and compared against
    every stored row by cosine similarity. The ``top_n`` highest-scoring
    rows are rendered as an HTML table, followed by a paragraph reporting
    the mean similarity of those rows as a percentage.

    Args:
        city: City part of the query.
        season: Season part of the query.
        most_active_time: Activity-time part of the query.
        top_n: Number of best-scoring rows to select. Must be at least 1.
            If it exceeds the number of stored rows, all rows are selected.

    Returns:
        A string of HTML: a table with class ``styled-table`` plus a
        ``<p>`` element with the average similarity. The table can hold
        fewer than ``top_n`` rows because duplicate rows among the
        selection are dropped.

    Raises:
        ValueError: If ``top_n`` is less than 1.
    """
    # Reject non-positive values up front: ``scores[-0:]`` is the *entire*
    # array and a negative value turns the slice into ``scores[k:]``, so
    # both would silently return far more rows than requested.
    if top_n < 1:
        raise ValueError(f"top_n must be a positive integer, got {top_n!r}")

    # Load model from pickle file
    # NOTE(security): pickle.load can execute arbitrary code. Only load a
    # bird_model.pkl that this program wrote itself; never an untrusted file.
    with open("bird_model.pkl", "rb") as model_file:
        bird_df, vectorizer, tfidf_matrix = pickle.load(model_file)

    # User query formatted similarly to the combined text the model was fit on
    user_query = city + " " + season + " " + most_active_time
    user_tfidf = vectorizer.transform([user_query])

    # Compute cosine similarity between the query and every stored row
    similarity_scores = cosine_similarity(user_tfidf, tfidf_matrix).flatten()

    # argsort is ascending, so take the last top_n positions and reverse
    # them to get best-match-first order
    top_indices = similarity_scores.argsort()[-top_n:][::-1]

    # Get the top matching birds along with relevant details
    result_df = bird_df.iloc[top_indices][[
        "Common Name", "Scientific Name", "City", "Season", "Most Active Time",
        "Region", "Primary Habitat", "Endemic Status", "Conservation Status", "Unique Fact"
    ]].drop_duplicates()

    # "Accuracy" here is the average similarity score of the selected rows
    # (computed before duplicate removal), expressed as a percentage
    accuracy = similarity_scores[top_indices].mean() * 100

    # Convert to HTML table format with custom styling.
    # NOTE(review): escape=False emits cell text verbatim, so any markup
    # characters in the dataset reach the output unescaped.
    table_html = result_df.to_html(index=False, escape=False, classes="styled-table")

    # Append accuracy information
    table_html += f"<p><strong>Accuracy of top {top_n} matches: {accuracy:.2f}%</strong></p>"

    return table_html

# Stylesheet for the results table (independent of user input, so defined first)
html_style = """
<style>
.styled-table {
    width: 100%;
    border-collapse: collapse;
    font-family: Arial, sans-serif;
}
.styled-table th {
    background-color: #4CAF50;
    color: white;
    padding: 10px;
    text-align: left;
    font-size: 16px;
}
.styled-table td {
    padding: 8px;
    border-bottom: 1px solid #ddd;
}
.styled-table tr:nth-child(even) {
    background-color: #f2f2f2;
}
.styled-table tr:nth-child(odd) {
    background-color: #ffffff;
}
</style>
"""

# Collect the search criteria interactively
user_city = input("Enter the City: ")
user_season = input("Enter the season: ")
user_most_active_time = input("Enter the most active time (Day, Night, Both): ")

# Look up the best matching birds (default number of results)
top_birds_html = get_top_birds(
    city=user_city,
    season=user_season,
    most_active_time=user_most_active_time,
)

# Emit the stylesheet followed by the results markup
print("".join([html_style, top_birds_html]))
