# In [None]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to preprocess data
def preprocess_data(df):
    """Return a new frame in which every missing value is an empty string.

    Args:
        df: The pandas DataFrame to clean. It is not modified.

    Returns:
        The result of ``df.fillna("")``.
    """
    return df.fillna(value="")

# Read the plant dataset and blank out its missing values in one step
plant_df = preprocess_data(pd.read_csv("indian_plant_dataset_15000_updated.csv"))

# Build the text that similarity is computed on: the descriptive columns
# joined left to right with a single space between each pair
_combined = plant_df["City"]
for _name in ("Peak Season", "Most Active Time", "Region", "Medicinal Use"):
    _combined = _combined + " " + plant_df[_name]
plant_df["combined_features"] = _combined

# Fit the TF-IDF vocabulary on the combined text and vectorize every row
vectorizer = TfidfVectorizer()
_corpus = plant_df["combined_features"].values.astype("U")
tfidf_matrix = vectorizer.fit_transform(_corpus)

# Persist the frame, the fitted vectorizer and the matrix together so that
# get_top_plants can reload them from disk
_model_bundle = (plant_df, vectorizer, tfidf_matrix)
with open("plant_model.pkl", "wb") as model_file:
    pickle.dump(_model_bundle, model_file)

# Function to get top matching plants
def get_top_plants(forest_type, peak_season, most_active_time, top_n=10):
    """Return an HTML table of the plants most similar to a text query.

    The three text arguments are joined with spaces into one query string,
    transformed with the vectorizer stored in ``plant_model.pkl`` and ranked
    by cosine similarity against the stored TF-IDF matrix.

    Args:
        forest_type: First query term. Despite its name, the script in this
            file passes the city entered by the user, and the stored
            features are built from the "City" column.
        peak_season: Second query term (peak season text).
        most_active_time: Third query term (most active time text).
        top_n: Maximum number of rows to return; must be at least 1. Fewer
            rows are returned when the dataset is smaller than ``top_n``.

    Returns:
        A string holding an HTML table (CSS class ``styled-table``) of the
        best matches, ordered from highest to lowest similarity, followed by
        a paragraph reporting the mean similarity of those matches as a
        percentage (labelled "Accuracy").

    Raises:
        ValueError: If ``top_n`` is less than 1.
    """
    # Validate before any I/O. With top_n == 0 the slice below would become
    # [-0:], which selects every row rather than none, and a negative top_n
    # would silently drop the lowest-scoring rows and return the rest.
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n!r}")

    # Load model from pickle file
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # load a plant_model.pkl that was produced by this project.
    with open("plant_model.pkl", "rb") as model_file:
        plant_df, vectorizer, tfidf_matrix = pickle.load(model_file)

    # User query formatted similarly to the stored combined features
    user_query = forest_type + " " + peak_season + " " + most_active_time
    user_tfidf = vectorizer.transform([user_query])

    # One similarity score per dataset row
    similarity_scores = cosine_similarity(user_tfidf, tfidf_matrix).flatten()

    # argsort is ascending, so take the last top_n positions and reverse them
    # to get the best match first
    top_indices = similarity_scores.argsort()[-top_n:][::-1]

    # Get the top matching plants along with relevant details
    result_df = plant_df.iloc[top_indices][[
        "Plant Name", "Scientific Name", "City", "Peak Season",
        "Height (m)", "Famous Forest", "Most Active Time", "Region", "Unique Fact",
        "Conservation Status", "Medicinal Use"
    ]]

    # "Accuracy" here is the average similarity score of the selected rows,
    # expressed as a percentage
    accuracy = similarity_scores[top_indices].mean() * 100

    # escape=False writes cell text verbatim, so any markup present in the
    # dataset reaches the output unescaped
    table_html = result_df.to_html(index=False, escape=False, classes="styled-table")

    # Append accuracy information
    table_html += f"<p><strong>Accuracy of top {top_n} matches: {accuracy:.2f}%</strong></p>"

    return table_html

# Stylesheet emitted ahead of the table markup
html_style = """
<style>
.styled-table {
    width: 100%;
    border-collapse: collapse;
    font-family: Arial, sans-serif;
}
.styled-table th {
    background-color: #4CAF50;
    color: white;
    padding: 10px;
    text-align: left;
    font-size: 16px;
}
.styled-table td {
    padding: 8px;
    border-bottom: 1px solid #ddd;
}
.styled-table tr:nth-child(even) {
    background-color: #f2f2f2;
}
.styled-table tr:nth-child(odd) {
    background-color: #ffffff;
}
</style>
"""

# Ask the user for the three search terms (the first one is a city name)
user_forest_type = input("Enter the City: ")
user_peak_season = input("Enter the peak season: ")
user_most_active_time = input("Enter the most active time (Day, Night, Both): ")

# Build the recommendation table for those terms
top_plants_html = get_top_plants(
    forest_type=user_forest_type,
    peak_season=user_peak_season,
    most_active_time=user_most_active_time,
)

# Print the stylesheet immediately followed by the table
print(f"{html_style}{top_plants_html}")
