# In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

# Fix NumPy's global random state so the generated data is reproducible
np.random.seed(42)

# Plot styling: seaborn's white-grid theme
sns.set(style="whitegrid")

# Size of the dataset and the content categories influencers are drawn from
num_records = 500
niches = ['Education', 'Fitness', 'Skincare', 'Technology', 'Fashion', 'Health']

# Raw synthetic attributes. The order of the random draws matters: changing it
# would change the values produced for a given seed.
influencers = [f'influencer_{i+1}' for i in range(num_records)]
niche_choices = np.random.choice(niches, size=num_records)
followers = np.random.randint(low=5000, high=1000000, size=num_records)
avg_likes = np.random.randint(low=100, high=50000, size=num_records)
avg_comments = np.random.randint(low=10, high=5000, size=num_records)
authenticity_score = np.random.uniform(0.5, 1.0, num_records).round(2)

# Engagement rate: average interactions (likes + comments) per follower
engagement_rate = (avg_likes + avg_comments) / followers

# Influencer score: 60% engagement, 40% authenticity (no Sentiment_Score),
# computed from the unrounded engagement rate
influencer_score = (0.6 * engagement_rate + 0.4 * authenticity_score).round(4)

# Assemble the table; only the stored engagement rate is rounded here
df = pd.DataFrame(dict(
    Influencer_ID=influencers,
    Niche=niche_choices,
    Followers=followers,
    Avg_Likes=avg_likes,
    Avg_Comments=avg_comments,
    Engagement_Rate=np.round(engagement_rate, 4),
    Authenticity_Score=authenticity_score,
    Influencer_Score=influencer_score,
))

# Introduce missing values randomly
def insert_nulls(column_name, fraction=0.05, frame=None):
    """Set a random sample of one column's values to NaN, in place.

    Args:
        column_name: Label of the column that receives the missing values.
        fraction: Share of rows to blank out. The number of rows is
            ``int(fraction * len(frame))``, i.e. truncated, not rounded.
        frame: DataFrame to modify. Defaults to the module-level ``df``.

    Rows are sampled without replacement from NumPy's global random state,
    so the selection is reproducible after ``np.random.seed``.
    """
    if frame is None:
        frame = df
    indices = np.random.choice(
        frame.index, size=int(fraction * len(frame)), replace=False
    )
    # NumPy integer columns cannot hold NaN. Cast explicitly rather than
    # relying on pandas' implicit upcast during assignment, which is
    # deprecated (FutureWarning) and slated to become an error.
    column_dtype = frame[column_name].dtype
    if isinstance(column_dtype, np.dtype) and column_dtype.kind in "iu":
        frame[column_name] = frame[column_name].astype(float)
    frame.loc[indices, column_name] = np.nan

# Insert nulls into selected columns
insert_nulls('Followers', 0.05)
insert_nulls('Avg_Likes', 0.04)
insert_nulls('Avg_Comments', 0.04)
insert_nulls('Authenticity_Score', 0.03)

# Recalculate the derived metrics so they reflect the injected nulls: a missing
# input propagates as NaN into both Engagement_Rate and Influencer_Score.
df['Engagement_Rate'] = (df['Avg_Likes'] + df['Avg_Comments']) / df['Followers']
# The score is computed from the full-precision rate, as it was when the
# DataFrame was first built.
df['Influencer_Score'] = np.round(
    0.6 * df['Engagement_Rate'] +
    0.4 * df['Authenticity_Score'], 4
)
# Only now round the stored rate to 4 decimals, so the column keeps the same
# precision it had before the recalculation.
df['Engagement_Rate'] = df['Engagement_Rate'].round(4)

# Prepare features for Promotions Rate
promo_features = df[['Followers', 'Engagement_Rate', 'Influencer_Score']].copy()

# Min-max scale each feature to [0, 1]. NaNs are passed to the scaler as-is:
# MinMaxScaler disregards them when fitting and maintains them when
# transforming. Zero-filling first would pull every feature's minimum down to 0
# and would give rows with missing inputs a fabricated rate.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(promo_features)

# Weighted combination to calculate Promotions Rate. The result is NaN wherever
# any input feature is missing, the same way Influencer_Score propagates NaN.
df['Promotions_Rate'] = np.round(
    0.5 * scaled_features[:, 2] +  # Influencer_Score
    0.3 * scaled_features[:, 1] +  # Engagement_Rate
    0.2 * scaled_features[:, 0],   # Followers
    4
)

# Write the finished dataset to disk without the row index
df.to_csv("Insta_Influencer_Data_set.csv", index=False)