In [584]:
!pip install scikit-learn
!pip install deap
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the 'fivethirtyeight' matplotlib style sheet to figures drawn after this point.
plt.style.use('fivethirtyeight')


import warnings
# NOTE(review): with no category/module arguments this filter hides every
# warning for the rest of the session, including pandas chained-assignment
# warnings that would otherwise fire in the feature-preparation cells.
# Consider narrowing it to the specific categories that are known noise.
warnings.filterwarnings('ignore')



In [585]:
# Load the raw posts export into the working DataFrame used by every later cell.
# NOTE(review): absolute '/content/...' path — presumably a Colab upload; adjust when running elsewhere.
df = pd.read_csv(filepath_or_buffer='/content/social_media_posts.csv')

In [586]:
# Preview the first 200 rows; as the cell's last expression the frame is rendered as output
# (the saved output below elides the middle rows).
df.head(200)

Unnamed: 0,post_id,user_id,post_text,hashtags,post_time,post_type,follower_count,likes,comments,shares,engagement_score,is_trending,day_of_week
0,post_1,user_471,Imagine mention general beat discover lose say...,"#budget,#model,#poor,#turn,#treatment",2025-04-12 04:49:01,video,943486,597,311,434,494.9,0,Saturday
1,post_2,user_91,At use development various claim gas find abou...,"#thus,#again,#black",2025-04-13 07:18:09,video,315298,2389,305,217,1546.6,1,Sunday
2,post_3,user_1578,Young factor no third at to probably here earl...,"#service,#picture,#sister,#environmental,#teacher",2025-04-21 00:50:43,text,926939,1770,514,303,1246.5,1,Monday
3,post_4,user_360,Field form executive close both interesting me...,"#say,#stop,#for",2025-04-11 11:39:50,video,383765,1914,196,907,1297.9,1,Friday
4,post_5,user_867,Community organization cover industry stage on...,"#real,#station,#allow",2025-04-04 05:53:32,text,434684,2123,23,562,1336.9,1,Friday
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,post_196,user_705,Dark within describe bar nation happen some ol...,"#fight,#really,#toward",2025-04-26 21:38:40,image,580638,1967,653,460,1422.1,1,Saturday
196,post_197,user_259,During time about apply.,"#open,#his,#growth,#hope",2025-04-22 20:09:28,image,703502,3133,731,632,2162.3,1,Tuesday
197,post_198,user_131,Full too benefit interest draw.,"#threat,#performance,#often,#choose",2025-04-20 12:06:56,image,144087,3243,859,154,2218.9,1,Sunday
198,post_199,user_595,Rate language traditional try image size assum...,#capital,2025-04-21 08:41:15,video,110066,2019,66,807,1311.9,1,Monday


In [588]:
# Parse the raw timestamp strings; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
df['post_time'] = pd.to_datetime(df['post_time'], errors='coerce')

# Derive the two calendar features used by the models in one step:
# hour of day and pandas' day-of-week number.
df['hour'], df['day'] = df['post_time'].dt.hour, df['post_time'].dt.dayofweek

In [589]:
# Split hashtags into a list of bare tags for easier analysis.
def split_hashtags(raw):
    """Turn one raw hashtag cell into a list of clean tag names.

    The CSV stores tags as a single string such as "#budget,#model,#poor".
    Splitting on '#' alone leaves the comma separators glued to the tags
    ('budget,'), so separators and surrounding whitespace are stripped from
    every piece and empty pieces are dropped.

    Args:
        raw: the cell value — normally a string; a list (cell already
            processed by an earlier run) or a missing value is also accepted.

    Returns:
        list[str]: tag names without '#', commas or surrounding whitespace.
    """
    if isinstance(raw, list):
        # Already split by a previous execution of this cell: keep it, so the
        # cell can be re-run without raising AttributeError on list.split.
        return raw
    if not isinstance(raw, str):
        # Missing value (NaN/None): a post without hashtags.
        return []
    pieces = (piece.strip(', \t\r\n') for piece in raw.split('#'))
    return [piece for piece in pieces if piece]


df['hashtags'] = df['hashtags'].apply(split_hashtags)

In [590]:
# Rows whose 'post_time' is missing after the datetime conversion.
invalid_dates = df.loc[df['post_time'].isna()]

# describe() summary of the frame, kept for eyeballing ranges and outliers.
numeric_summary = df.describe()

In [591]:
# Print each verification artefact under its heading; sep="\n" puts the
# heading and the object on separate lines, exactly like two print() calls.
print("Invalid Dates:", invalid_dates, sep="\n")
print("\nSummary of Numeric Columns:", numeric_summary, sep="\n")
print("\nSample of Cleaned Data:", df.head(), sep="\n")

Invalid Dates:
Empty DataFrame
Columns: [post_id, user_id, post_text, hashtags, post_time, post_type, follower_count, likes, comments, shares, engagement_score, is_trending, day_of_week, hour, day]
Index: []

Summary of Numeric Columns:
                           post_time  follower_count         likes  \
count                          10000    10000.000000  10000.000000   
mean   2025-04-16 11:14:17.022299904   499642.685300   2499.031600   
min              2025-04-01 11:47:26      100.000000      1.000000   
25%    2025-04-08 20:50:34.249999872   251116.250000   1260.000000   
50%       2025-04-16 11:30:32.500000   499240.000000   2480.500000   
75%              2025-04-23 20:45:25   749787.000000   3761.000000   
max              2025-05-01 11:45:31   999936.000000   4999.000000   
std                              NaN   287204.964531   1444.659882   

          comments        shares  engagement_score   is_trending  \
count  10000.00000  10000.000000      10000.000000  10000.000000

In [592]:
# Prepare features and target variables.
# Both selections are explicit copies: `features` is modified in later cells,
# and writing into a bare column slice of `df` is a chained assignment that
# pandas warns about (the warning is hidden by the global warnings filter).
features = df[['follower_count', 'hour', 'day', 'hashtags', 'post_type']].copy()
targets = df[['likes', 'comments', 'shares', 'engagement_score', 'is_trending']].copy()

In [593]:
# Convert each post's hashtag list into a single space-separated string.
# The source is df['hashtags'] (still a list per row) rather than
# features['hashtags'], so re-running this cell yields the same result instead
# of joining the characters of an already-joined string.
features['hashtags'] = df['hashtags'].apply(' '.join)

In [594]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.3,
    random_state=42,
)

In [595]:
# Define preprocessing for the two categorical inputs.
# 'post_type' is already one of the columns selected when `features` is built,
# so nothing needs to be added to `features` here (and a change made after the
# train/test split would not reach X_train / X_test anyway).
#
# Both columns are one-hot encoded; the remaining numeric columns
# (follower_count, hour, day) are forwarded unchanged via remainder='passthrough'.
# NOTE(review): 'hashtags' holds one space-joined string per post, so every
# distinct tag combination becomes its own category. With
# handle_unknown='ignore' a combination not seen during fit contributes no
# signal at prediction time — consider a token-level encoding.
preprocessor = ColumnTransformer(
    transformers=[
        ('hashtags', OneHotEncoder(handle_unknown='ignore'), ['hashtags']),
        ('post_type', OneHotEncoder(handle_unknown='ignore'), ['post_type']),
    ],
    remainder='passthrough',
)


In [596]:
# Model pipeline: column preprocessing followed by a random-forest regressor
# with a fixed seed.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42)),
    ]
)


In [597]:
# Train the model for 'likes' prediction: fit the preprocessing + regressor
# pipeline on the training split, using only the 'likes' target column.
pipeline.fit(X_train, y_train['likes'])

In [598]:
# Score the 'likes' pipeline on the held-out rows and report its mean squared error.
y_pred = pipeline.predict(X_test)
mse_likes = mean_squared_error(y_true=y_test['likes'], y_pred=y_pred)

print(f"Mean Squared Error for Likes: {mse_likes}")

Mean Squared Error for Likes: 2293513.7251072335


In [599]:
# One independent pipeline per engagement metric, stored under the metric's name.
# NOTE(review): every pipeline receives the same `preprocessor` object rather
# than a fresh copy — confirm that sharing it between pipelines is intended.
models = {}
for target in ('likes', 'comments', 'shares'):
    steps = [
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42)),
    ]
    # fit() returns the fitted pipeline itself, so it can be bound directly.
    pipeline = Pipeline(steps).fit(X_train, y_train[target])
    models[target] = pipeline


In [600]:
# Report the held-out mean squared error of each trained model.
# Only the three regressed metrics are scored ('engagement_score' and
# 'is_trending' have no model of their own).
for target in ('likes', 'comments', 'shares'):
    y_pred = models[target].predict(X_test)
    mse = mean_squared_error(y_true=y_test[target], y_pred=y_pred)
    print(f"Mean Squared Error for {target.capitalize()}: {mse}")

Mean Squared Error for Likes: 2293513.7251072335
Mean Squared Error for Comments: 92077.8155518
Mean Squared Error for Shares: 92474.50457983333


In [601]:
# Ensure consistent hashtag preprocessing during input
def preprocess_hashtags(hashtags):
    """Join a list of hashtags into the space-separated string the models expect.

    Each tag is normalised the same way the training column was built: a
    leading '#' and surrounding whitespace are removed, and tags that end up
    empty are dropped. Without this, an input such as ['say', ' stop'] would
    produce 'say  stop' (double space), which can never equal a string seen
    during training.

    Args:
        hashtags: iterable of hashtag strings, with or without a leading '#'.

    Returns:
        str: the cleaned tags joined by single spaces ('' for no tags).
    """
    cleaned = (tag.strip().lstrip('#').strip() for tag in hashtags)
    return ' '.join(tag for tag in cleaned if tag)

# Prediction function with additional features
def predict_post(models, follower_count, post_hour, post_day, hashtags, post_type,
                 trending_threshold=0.1):
    """Predict likes, comments and shares for one hypothetical post.

    A single-row DataFrame with the training feature columns is built and
    passed to every model in `models`. Two derived values are then added:
    an engagement rate and a trending flag.

    NOTE(review): the engagement rate here is
    (likes + comments + shares) / follower_count. This is not the same
    quantity as the dataset's 'engagement_score' column; the sample rows
    displayed earlier in this notebook are consistent with
    0.6*likes + 0.3*comments + 0.1*shares — confirm which definition is wanted.

    Args:
        models: mapping of target name -> fitted estimator exposing
            `predict(DataFrame)`; must contain 'likes', 'comments', 'shares'.
        follower_count: follower count of the posting account; must be > 0
            because it is the denominator of the engagement rate.
        post_hour: hour of day of the post (the training feature comes from
            `Series.dt.hour`).
        post_day: day-of-week number (the training feature comes from
            `Series.dt.dayofweek`).
        hashtags: list of hashtag strings, forwarded to `preprocess_hashtags`.
        post_type: post type label, e.g. 'video'.
        trending_threshold: engagement rate strictly above which the post is
            flagged as trending. Defaults to 0.1.

    Returns:
        dict: one prediction per key of `models`, plus 'engagement_score'
        and 'is_trending'.

    Raises:
        ValueError: if `follower_count` is not positive.
    """
    if follower_count <= 0:
        # Dividing by zero (or a negative count) would yield an infinite or
        # negative rate and a meaningless trending flag.
        raise ValueError("follower_count must be a positive number")

    # Column names must match the frame the pipelines were fitted on.
    input_data = pd.DataFrame({
        'follower_count': [follower_count],
        'hour': [post_hour],
        'day': [post_day],
        'hashtags': [preprocess_hashtags(hashtags)],
        'post_type': [post_type]
    })

    # Each model returns an array with one element for the single input row.
    predictions = {target: models[target].predict(input_data)[0] for target in models}

    total_interactions = predictions['likes'] + predictions['comments'] + predictions['shares']
    engagement_score = total_interactions / follower_count
    predictions['engagement_score'] = engagement_score
    predictions['is_trending'] = engagement_score > trending_threshold
    return predictions

In [604]:
# Example usage of the updated prediction function.
# The keys deliberately mirror predict_post's parameter names so the dict can
# be unpacked straight into the call.
new_post = {
    'follower_count': 8300,
    'post_hour': 7,   # 07:00 on the 24-hour clock (same scale as df['hour'])
    'post_day': 4,    # Friday — the 'day' feature uses dayofweek numbering, Monday=0
    'hashtags': ['say', 'stop'],  # bare tag names, no '#' and no padding spaces
    'post_type': 'video'  # Example post type
}

predicted_engagement = predict_post(models=models, **new_post)

# Display the predictions
print("\nPredicted Engagement for New Post:")
print(f"Likes: {predicted_engagement['likes']}")
print(f"Comments: {predicted_engagement['comments']}")
print(f"Shares: {predicted_engagement['shares']}")
print(f"Engagement Score: {predicted_engagement['engagement_score']}")
print(f"Is Trending: {predicted_engagement['is_trending']}")



Predicted Engagement for New Post:
Likes: 2571.26
Comments: 535.65
Shares: 578.17
Engagement Score: 0.44398554216867475
Is Trending: True


In [606]:
import random
import numpy as np

# Define the genetic algorithm
def genetic_algorithm(models, follower_count, post_type, generations=20, population_size=50, mutation_rate=0.1,
                      hashtags_pool=None, seed=None):
    """Search for the hashtags, hour and weekday with the highest predicted engagement.

    A candidate is a dict with keys 'hashtags' (1-3 tags drawn from the pool),
    'hour' and 'day'. Its fitness is the 'engagement_score' returned by
    `predict_post` for the given account and post type. Each generation keeps
    the fitter half of the population as parents and refills the population
    with mutated crossovers of random parent pairs.

    Args:
        models: fitted models, forwarded unchanged to `predict_post`.
        follower_count: follower count of the account, forwarded to `predict_post`.
        post_type: post type label, forwarded to `predict_post`.
        generations: number of generations to evolve.
        population_size: candidates per generation; must be at least 2.
        mutation_rate: per-gene probability of replacing the gene with a
            fresh random value.
        hashtags_pool: optional list of candidate hashtags; defaults to the
            built-in eight-tag pool.
        seed: optional seed for a private random generator, making a run
            reproducible. When None, the module-level `random` state is used.

    Returns:
        dict: the best candidate seen in any generation (including the
        initial population), with keys 'hashtags', 'hour' and 'day'.

    Raises:
        ValueError: if `population_size` < 2 or the hashtag pool is empty.
    """
    if population_size < 2:
        # Two distinct parents are sampled for every child.
        raise ValueError("population_size must be at least 2")

    # With no seed, keep drawing from the module-level generator so that a
    # caller's earlier random.seed(...) still governs the run.
    rng = random if seed is None else random.Random(seed)

    # Define the possible hashtags and time slots
    if hashtags_pool is None:
        hashtags_pool = ['budget', 'model', 'poor', 'turn', 'treatment', 'real', 'station', 'allow']
    # Strip padding so a tag like ' turn' cannot leak a stray space into the model input.
    hashtags_pool = [tag.strip() for tag in hashtags_pool if tag.strip()]
    if not hashtags_pool:
        raise ValueError("hashtags_pool must contain at least one non-empty hashtag")
    max_tags = min(3, len(hashtags_pool))

    # Cover the full feature ranges: hours 0-23 and dayofweek 0-6. Starting
    # either range at 1 would never try midnight or day 0.
    hours_pool = list(range(24))
    days_pool = list(range(7))

    # Every fitness evaluation runs all prediction models, and the same
    # candidate is scored several times per generation, so results are memoised.
    fitness_cache = {}

    def random_hashtags():
        return rng.sample(hashtags_pool, rng.randint(1, max_tags))

    # Initialize population
    def create_individual():
        return {
            'hashtags': random_hashtags(),
            'hour': rng.choice(hours_pool),
            'day': rng.choice(days_pool)
        }

    # Evaluate fitness
    def fitness(individual):
        # Tag order is part of the key because it changes the joined string.
        key = (tuple(individual['hashtags']), individual['hour'], individual['day'])
        if key not in fitness_cache:
            predictions = predict_post(
                models=models,
                follower_count=follower_count,
                post_hour=individual['hour'],
                post_day=individual['day'],
                hashtags=individual['hashtags'],
                post_type=post_type
            )
            fitness_cache[key] = predictions['engagement_score']
        return fitness_cache[key]

    # Selection: keep the fitter half, but never fewer than two parents.
    def select(population):
        ranked = sorted(population, key=fitness, reverse=True)
        return ranked[:max(2, len(ranked) // 2)]

    # Crossover: each gene is inherited whole from one of the two parents.
    def crossover(parent1, parent2):
        return {
            # Copy the list so a child never shares it with a parent.
            'hashtags': list(rng.choice([parent1['hashtags'], parent2['hashtags']])),
            'hour': rng.choice([parent1['hour'], parent2['hour']]),
            'day': rng.choice([parent1['day'], parent2['day']])
        }

    # Mutation: each gene is independently re-drawn with probability mutation_rate.
    def mutate(individual):
        if rng.random() < mutation_rate:
            individual['hashtags'] = random_hashtags()
        if rng.random() < mutation_rate:
            individual['hour'] = rng.choice(hours_pool)
        if rng.random() < mutation_rate:
            individual['day'] = rng.choice(days_pool)
        return individual

    population = [create_individual() for _ in range(population_size)]

    # Track the best candidate across all generations: parents are not
    # carried over, so a later generation can be worse than an earlier one.
    best_individual = max(population, key=fitness)

    # Run the GA
    for generation in range(generations):
        # Evaluate and select
        parents = select(population)

        # Create the next generation
        next_generation = []
        while len(next_generation) < population_size:
            parent1, parent2 = rng.sample(parents, 2)
            next_generation.append(mutate(crossover(parent1, parent2)))

        population = next_generation

        # Display best individual of the generation
        generation_best = max(population, key=fitness)
        if fitness(generation_best) > fitness(best_individual):
            best_individual = generation_best
        print(f"Generation {generation + 1}: Best Engagement Score = {fitness(generation_best):.2f}")

    # Return the best individual seen overall
    return best_individual

# Run a short search (5 generations of 20 candidates) for an image post from
# an account with 1,000 followers.
optimized_strategy = genetic_algorithm(
    models,
    1000,
    'image',
    generations=5,
    population_size=20,
    mutation_rate=0.1,
)

# Report the winning hashtags, hour and day.
print("\nOptimized Strategy:")
print(f"Hashtags: {optimized_strategy['hashtags']}")
print(f"Hour: {optimized_strategy['hour']}")
print(f"Day: {optimized_strategy['day']}")


Generation 1: Best Engagement Score = 4.09
Generation 2: Best Engagement Score = 4.10
Generation 3: Best Engagement Score = 4.03
Generation 4: Best Engagement Score = 4.03
Generation 5: Best Engagement Score = 4.03

Optimized Strategy:
Hashtags: ['poor', 'station', 'allow']
Hour: 7
Day: 3
