In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Title: Global Condom Market Trends (2015-2025)

### Description: Analyzing global condom sales and awareness trends from 2015 to 2025, highlighting market growth, consumer behavior, and regional insights.

#### Import dataset

In [None]:
# Read the dataset CSV into the DataFrame used by every later cell.
dataset_path = '/kaggle/input/global-condom-sales-and-awareness-2015-2025/Rich_Global_Condom_Usage_Dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Preview the first rows to check that the file loaded as expected.
df.head()

In [None]:
# Preview the last rows of the dataset.
df.tail()

In [None]:
# Summarize columns, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics for the dataset's columns.
df.describe()

In [None]:
# Count missing values in each column.
df.isnull().sum()

In [None]:
# Count rows flagged as duplicates of another row.
df.duplicated().sum()

In [None]:
# Dataset dimensions as (rows, columns).
df.shape

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Column labels, for reference when selecting columns in later cells.
df.columns

## Data visualizations

In [None]:
# Sales and revenue by year, both drawn as lines on one shared axis.
fig, ax = plt.subplots(figsize=(12, 6))
trend_lines = (
    ("Total Sales (Million Units)", "Total Sales", "o"),
    ("Market Revenue (Million USD)", "Market Revenue", "s"),
)
for trend_column, trend_label, trend_marker in trend_lines:
    sns.lineplot(data=df, x="Year", y=trend_column, label=trend_label, marker=trend_marker, ax=ax)

ax.set_xlabel("Year")
ax.set_ylabel("Sales & Revenue")
ax.set_title("Global Condom Sales & Market Revenue (2015-2025)")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Scatter of awareness index against teen pregnancy rate, colored by country.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x="Awareness Index (0-10)",
    y="Teen Pregnancy Rate (per 1000 teens)",
    hue="Country",
    palette="coolwarm",
    ax=ax,
)
ax.set_title("Awareness vs Teen Pregnancy Rate")
ax.set_xlabel("Awareness Index")
ax.set_ylabel("Teen Pregnancy Rate")
ax.grid(True)
plt.show()

In [None]:
# Horizontal count plot of condom types; bars follow the value_counts() order.
condom_type_order = df["Most Popular Condom Type"].value_counts().index

fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(
    data=df,
    y="Most Popular Condom Type",
    order=condom_type_order,
    palette="Blues_r",
    ax=ax,
)
ax.set_title("Most Popular Condom Types Worldwide")
ax.set_xlabel("Count")
ax.set_ylabel("Condom Type")
plt.show()

In [None]:
# Mean online share over all rows; the offline share is its complement to 100.
online_sales = df["Online Sales (%)"].mean()
offline_sales = 100 - online_sales
channel_shares = {"Online": online_sales, "Offline": offline_sales}

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    list(channel_shares.values()),
    labels=list(channel_shares.keys()),
    autopct="%1.1f%%",
    colors=["skyblue", "lightcoral"],
)
ax.set_title("Online vs Offline Condom Sales")
plt.show()

In [None]:
# Horizontal count plot of dominant brands; bars follow the value_counts() order.
brand_order = df["Brand Dominance"].value_counts().index

fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(
    data=df,
    y="Brand Dominance",
    order=brand_order,
    palette="Greens_r",
    ax=ax,
)
ax.set_title("Market Dominance of Condom Brands")
ax.set_xlabel("Market Share Count")
ax.set_ylabel("Brand")
plt.show()

In [None]:
# Parse a combined "male - female" purchase-share string into two numbers
def extract_percentages(value):
    """Split a string such as "55% Male - 45% Female" into two floats.

    Args:
        value: Text made of a male part and a female part joined by " - ".

    Returns:
        A ``(male_percentage, female_percentage)`` tuple of floats, or
        ``(None, None)`` if the value cannot be parsed; in that case the
        offending value and the error are printed.
    """
    try:
        male_text, female_text = value.split(" - ")
        # Strip each part's label, then convert what is left to a float.
        labelled_parts = ((male_text, "% Male"), (female_text, "% Female"))
        parsed = tuple(
            float(text.replace(label, "").strip()) for text, label in labelled_parts
        )
    except Exception as e:
        print(f"Error processing value: {value}, Error: {e}")
        return None, None
    return parsed

# Parse the combined column into two numeric columns (one tuple per row,
# unzipped into a male sequence and a female sequence).
df["Male Purchases (%)"], df["Female Purchases (%)"] = zip(*df["Male vs Female Purchases (%)"].apply(extract_percentages))

# Stacked bar chart of the per-country mean shares.
# The axes are created once and handed to DataFrame.plot via `ax=`; calling
# plt.figure() first and then .plot() without `ax` leaves an extra empty figure
# in the output because .plot() opens a figure of its own.
fig, ax = plt.subplots(figsize=(12, 6))
country_means = df.groupby("Country")[["Male Purchases (%)", "Female Purchases (%)"]].mean()
country_means.plot(kind="bar", stacked=True, colormap="coolwarm", ax=ax)

ax.set_title("Male vs Female Condom Purchases by Country")
ax.set_xlabel("Country")
ax.set_ylabel("Percentage")
ax.legend(["Male", "Female"])
plt.show()


In [None]:
# Box plot of contraceptive usage rate, one box per country.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=df,
    x="Country",
    y="Contraceptive Usage Rate (%)",
    palette="magma",
    ax=ax,
)
# Rotate the country names so the tick labels do not overlap.
plt.xticks(rotation=90)
ax.set_title("Contraceptive Usage Rate by Country")
plt.show()

In [None]:
# Violin plot of the awareness index, split by the sex-education column.
fig, ax = plt.subplots(figsize=(8, 6))
sns.violinplot(
    data=df,
    x="Sex Education Programs (Yes/No)",
    y="Awareness Index (0-10)",
    palette="pastel",
    ax=ax,
)
ax.set_title("Impact of Sex Education on Awareness Index")
ax.set_xlabel("Sex Education Programs")
ax.set_ylabel("Awareness Index")
plt.show()

## Predictive modeling

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [None]:
# Target to predict; every other column of df is used as a feature.
target_col = "Sex Education Programs (Yes/No)"  # Example: Predicting presence of sex education
features = df.columns[df.columns != target_col].tolist()

In [None]:
# Integer-encode every object-dtype column, keeping one fitted encoder per
# column in `label_encoders`.
# NOTE: the encoded values overwrite the original text in df in place.
label_encoders = {
    col: LabelEncoder() for col in df.select_dtypes(include=["object"]).columns
}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

In [None]:
# Feature matrix and target vector, then a seeded 80/20 train/test split.
X, y = df[features], df[target_col]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate classifiers keyed by display name; every value exposes fit/predict.
#
# - Logistic Regression, SVM and K-Nearest Neighbors are sensitive to feature
#   scale, so each is wrapped in a pipeline that standardizes the features
#   first. The scaler is fitted only on the data passed to fit().
# - max_iter is raised for Logistic Regression so the solver has room to converge.
# - Gradient Boosting and XGBoost get a fixed random_state, matching Random
#   Forest, so repeated runs are comparable.
# - `use_label_encoder` is dropped: it is deprecated in XGBoost and only
#   triggers an "unused parameter" warning on current releases.
models = {
    "Logistic Regression": make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": make_pipeline(StandardScaler(), SVC()),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42),
    "K-Nearest Neighbors": make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5)),
    "Naïve Bayes": GaussianNB()
}

In [None]:
# Fit each candidate on the training split and record its test accuracy (in %).
results = {}
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # accuracy_score returns a fraction; scale it to a percentage.
    accuracy = 100 * accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"{name}: {accuracy:.2f}% Accuracy")

In [None]:
# Rank models from highest to lowest accuracy and print the ranking.
sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
print("\nModel Performance Ranking:")
# The loop variable is deliberately not called `model`: that name holds the
# last fitted estimator from the training cell and would be overwritten by a
# plain string here.
for model_name, acc in sorted_results:
    print(f"{model_name}: {acc:.2f}% Accuracy")

In [None]:
# Sort models from highest to lowest accuracy for plotting
sorted_results = dict(sorted(results.items(), key=lambda x: x[1], reverse=True))
model_names = list(sorted_results.keys())
model_accuracies = list(sorted_results.values())

# Plot bar chart (one color per bar)
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(
    model_names,
    model_accuracies,
    color=["darkblue", "darkgreen", "darkred", "purple", "orange", "brown", "gray"],
)

# Annotate each bar with its accuracy, placed just above the bar top
for bar in bars:
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() + 1,
        f"{bar.get_height():.2f}%",
        ha='center',
        fontsize=12,
        fontweight="bold",
    )

# Labels and title
ax.set_xlabel("ML Models")
ax.set_ylabel("Accuracy (%)")
ax.set_title("Model Performance Ranking")
# Labels sit 1 unit above each bar, so the axis needs headroom beyond 100:
# with an upper limit of exactly 100, the label of a ~100% bar falls outside
# the axes and collides with the title.
ax.set_ylim(0, 110)
plt.xticks(rotation=25)
ax.grid(axis="y", linestyle="--", alpha=0.7)

# Show plot
plt.show()

## Thank you! Please upvote if you found this notebook helpful.