In [58]:
#import useful libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer ## Handling Missing Values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
from sklearn.preprocessing import OneHotEncoder
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## Model training to predict the number of likes

In [59]:
#load the data
df = pd.read_csv("data/preprocessed_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes
0,0,1,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11,139
1,1,2,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2,23
2,2,3,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2,25
3,3,4,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3,49
4,4,5,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3,30


In [60]:
# Select the relevant features and target variables

features = ['USERNAME', 'Caption', 'Hashtags', 'Followers','Time since posted']
target_likes = 'Likes'
target_time_since_posted = 'Time since posted'

In [61]:
# Split the data into training and testing sets

X = df[features]
y_likes = df[target_likes]
# y_time_since_posted = df[target_time_since_posted]

X_train,X_test,y_likes_train,y_likes_test= train_test_split(X, y_likes, test_size=0.2, random_state=42)

In [62]:
print("X_train",X.shape)
print("X_test",X_test.shape)

X_train (94, 5)
X_test (19, 5)


In [88]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns
print(categorical_cols)
print(numerical_cols)

Index(['USERNAME', 'Caption', 'Hashtags'], dtype='object')
Index(['Followers', 'Likes'], dtype='object')


In [64]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehotencoder',OneHotEncoder(sparse=False, handle_unknown='ignore')),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])

In [65]:
#  preprocess the data based on pipleline
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())



In [66]:
X_train.head()

Unnamed: 0,num_pipeline__Followers,num_pipeline__Time since posted,cat_pipeline__USERNAME__ehab.othman_,cat_pipeline__USERNAME__linda_smith567,cat_pipeline__USERNAME_ah_studio_,cat_pipeline__USERNAME_aitrading_official,cat_pipeline__USERNAME_ale_borba,cat_pipeline__USERNAME_alpha_mentor_,cat_pipeline__USERNAME_anki7singh,cat_pipeline__USERNAME_ansonmccade,...,cat_pipeline__Hashtags_#weekend #chill #chilling #Summer#founder #startup #smallbusiness #smallbiz#contentmarketing #consulting,cat_pipeline__Hashtags_#whoiswho #aitrading #ai #aitradingteam#instateam #instapeople #ai #trading#artificialintelligence #crypto#cryptocurrency #blockchain #tradingforex#forex #fiatmoney #coins #machinelearning#userexperience #instamachinelearning#instabigdata #instamarketing#artificialintelligence #deeplearning#datascience #industry #marketing#bigdata #datascience #machinelearning#ml,cat_pipeline__Hashtags_#worldcode #coding#python #codeaholics #rstudio #codinglife#worldofprogrammers #datascience#machinelearning #dataviz #data #statistics#macbookpro #peoplewhocode#codeismylife #datavisualization#artificialintelligence #digitalnomads#digitalnomad #travel,cat_pipeline__Hashtags_#youtube #applemusic #itunes#soundcloud #spinrilla #spotify #bigdata#blockchain #dontbandwagonlater,cat_pipeline__Hashtags_#любовь #gm #sme #smenigeria #profits#businessowners #businessplan#entrepreneurship #entrepreneur#blockchain #crypto #cointelegraph#bitcoinprice #mining #cryptocurrencies#bch #bitcoins #litecoin #investment#investor #stockmarket #stocks #getrich#makemoney #makemoneyonline#mentorship #mentoring #xrp #bitfinex#altcoins,cat_pipeline__Hashtags_.#Tech #virtualreality #IoT #Machinelearning,cat_pipeline__Hashtags_[#Infographic] #Wearable #Sensors #MachineLearning#IoT #BigData #DigitalTransformation#futureofwork #marketing #analytics#bigdata #Cloud #Blogging#ContentMarketing #DigitalMarketing ht: #BigData #MachineLearning #AI #IoT#infograp,cat_pipeline__Hashtags_thebeautymindset#businessman#quoteoftheday #businessowner#businesswoman #success #grind#motivation #motivational #lifestyle#happiness #entrepreneurs#entrepreneurlife #business #working #founder#startup #money #magazine #moneymaker#startuplife #successful #passion #inspiredaily#hardwork #hardworkpaysoff #desire,cat_pipeline__Hashtags_ #datascience #data #tech #technology#future #machinelearning #ai#visualizations #dataisbeautiful,cat_pipeline__Hashtags_ #deck .#mac #macintosh#sayhello #apple #stevejobs #ai #evolution#artificialintelligence #machinelearning#terminator #illbeback #technology#computerevolution #computerscience#sciencefiction#computersciencetosciencefiction#tomorrowstechnology #vr #ar #robot#robots #t2 #businessdeck #businessslides#illustration #sketches #drawing
0,-0.768162,-0.388968,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,...,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,8.602325,-0.116248,-0.116248
1,-0.014088,-0.10013,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,...,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248
2,0.196501,-0.388968,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,...,8.602325,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248
3,-0.129958,-0.388968,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,...,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248
4,-0.145591,-0.388968,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,...,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248,-0.116248


In [67]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

##### Training a model to predict the number of likes:

In [68]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [69]:

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_likes_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_likes_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 36.84756629422695
MAE: 29.439111032721605
R2 score 14.937232879251983


Lasso
Model Training Performance
RMSE: 29.295872384603527
MAE: 18.162563644953853
R2 score 46.23065341855032


Ridge
Model Training Performance
RMSE: 36.135333618013746
MAE: 28.318453667285354
R2 score 18.193836481052127


Elasticnet
Model Training Performance
RMSE: 35.506145619304036
MAE: 24.48361976163034
R2 score 21.017850597196265




In [70]:
# training with xg-boost
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [71]:
# fit model no training data
le = LabelEncoder()
ylabel_likes_train = le.fit_transform(y_likes_train)

In [72]:
ylabel_likes_test = le.fit_transform(y_likes_test)

In [73]:
import re

regex = re.compile(r"\[|\]|<", re.IGNORECASE)

X_train.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_train.columns.values]
X_test.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_test.columns.values]

In [74]:
# fit model no training data

model = XGBRegressor()
model.fit(X_train, y_likes_train)

In [75]:
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_likes_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


Accuracy: 10.53%


In [76]:
#Define model evaluation method

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from numpy import absolute
cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
#Evaluate the model
scores = cross_val_score(model, X_test, ylabel_likes_test, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
#Absolute MAE
#Absolute MAE
scores = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()))

Mean MAE: 4.680 (2.009)


##### So in case of no of likes in this model Lasso Regression gives the max accuracy with 47% and not XG-Boost

In [None]:

## training model for no of times since posted

## Model training to predict the number of times since posted

In [81]:
# Select the relevant features and target variables
# Split the data into training and testing sets

features = ['USERNAME', 'Caption', 'Hashtags', 'Followers','Likes']
target_time_since_posted = 'Time since posted'
X = df[features]
y_time_since_posted=df[target_time_since_posted]
X_train,X_test,y_time_since_posted_train,y_time_since_posted_test= train_test_split(X, y_time_since_posted, test_size=0.2, random_state=42)

In [83]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [84]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehotencoder',OneHotEncoder(sparse=False, handle_unknown='ignore')),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])

In [85]:
#  preprocess the data based on pipleline
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())



In [86]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [87]:
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_likes_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_likes_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 32.62415211527379
MAE: 25.27502083871913
R2 score 33.31926851253198


Lasso
Model Training Performance
RMSE: 0.671712185463488
MAE: 0.5391068397197667
R2 score 99.97173243872156


Ridge
Model Training Performance
RMSE: 32.14260965861128
MAE: 24.330824888894572
R2 score 35.27319696166503


Elasticnet
Model Training Performance
RMSE: 31.208043589928256
MAE: 20.194325289248773
R2 score 38.982420397798535




##### Hence the model Lasso Regression gives 99.9% accuray for the prediction of no of time since posted