In [42]:
#import necessary libraries
import pandas as pd #to handle dataframe
import numpy as np #calculations on arrays
import matplotlib.pyplot as plt #for plotting
import seaborn as sns #for higher-lvl interface for more aesthetic🌸 and informative statistical graph (builds on top of matplotlib)

import joblib #for saving/loading python objects (ML model)
import os #to interact with OS
import warnings

from sklearn.preprocessing import StandardScaler, MinMaxScaler,RobustScaler
from sklearn.model_selection import  train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score


In [43]:
# Load the Instagram reach dataset; the literal string 'na' is read as missing.
df = pd.read_csv(
    'instagram_reach.csv',
    na_values=['na'],
)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes
0,0,1,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11 hours,139
1,1,2,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2 hours,23
2,2,3,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2 hours,25
3,3,4,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3 hours,49
4,4,5,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3 hours,30


In [44]:
# Inspect the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'S.No', 'USERNAME', 'Caption', 'Followers', 'Hashtags',
       'Time since posted', 'Likes'],
      dtype='object')

In [45]:
# Only the last expression of a cell is rendered automatically, so a bare
# `df.shape` above `df.info()` would be silently discarded. Print it explicitly.
print(df.shape)
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         100 non-null    int64 
 1   S.No               100 non-null    int64 
 2   USERNAME           100 non-null    object
 3   Caption            94 non-null     object
 4   Followers          100 non-null    int64 
 5   Hashtags           100 non-null    object
 6   Time since posted  100 non-null    object
 7   Likes              100 non-null    int64 
dtypes: int64(4), object(4)
memory usage: 6.4+ KB


In [46]:
# Discard the index-like and text columns that are not used further on.
unused_columns = ['Unnamed: 0', 'S.No', 'USERNAME', 'Caption', 'Hashtags']
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,Followers,Time since posted,Likes
0,1600,11 hours,139
1,880,2 hours,23
2,255,2 hours,25
3,340,3 hours,49
4,304,3 hours,30


In [47]:
# Confirm which columns remain in the frame.
df.columns

Index(['Followers', 'Time since posted', 'Likes'], dtype='object')

In [48]:
# NOTE: dtype 'O' (capital letter O, not zero) is the 'object' dtype, typically
# used for strings or categorical data.
# Partition the columns into numerical and categorical groups by dtype.
numeric_features = []
categorical_features = []
for feature in df.columns:
    if df[feature].dtype == 'O':
        categorical_features.append(feature)
    else:
        numeric_features.append(feature)

# Report how many columns landed in each group.
print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))

We have 2 numerical features : ['Followers', 'Likes']

We have 1 categorical features : ['Time since posted']


In [49]:
# For the regression model every column must be numeric, so convert
# 'Time since posted' (e.g. '11 hours') into an integer number of hours.
import re  # regular-expression matching used to parse the hour count

# Accepts 'N', 'N hour' or 'N hours', with optional surrounding whitespace.
_HOURS_PATTERN = re.compile(r'\s*(\d+)\s*(?:hours?)?\s*')


def _parse_hours(value):
    """Return the integer hour count encoded in ``value``.

    ``value`` may be a string such as ``'11 hours'`` or ``'1 hour'``, or an
    already-converted integer. Accepting integers keeps this cell safe to
    re-run after the column has been converted once.

    Raises:
        ValueError: if ``value`` is not a whole number optionally followed by
            'hour'/'hours'.
    """
    match = _HOURS_PATTERN.fullmatch(str(value))
    if match is None:
        raise ValueError('Unrecognised "Time since posted" value: {!r}'.format(value))
    return int(match.group(1))


df['Time since posted'] = df['Time since posted'].map(_parse_hours)

In [50]:
# Re-check the dtypes. Only the last expression of a cell is rendered
# automatically, so print the dtypes explicitly; a bare `df.dtypes` above
# `df.head()` would be silently discarded.
print(df.dtypes)
df.head()

Unnamed: 0,Followers,Time since posted,Likes
0,1600,11,139
1,880,2,23
2,255,2,25
3,340,3,49
4,304,3,30


In [51]:
# Separate the model inputs (X) from the columns to be predicted (y).
# NOTE(review): 'Time since posted' is treated as a target rather than a
# feature here - confirm this is intended.
target_columns = ['Likes', 'Time since posted']
X = df.drop(columns=target_columns)
y = df[target_columns]

In [54]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Configuration of the final random-forest model.
forest_params = dict(n_estimators=1000, max_depth=20, random_state=42, n_jobs=-1)
final_model = RandomForestRegressor(**forest_params)

# Fit on the training split, then predict the held-out rows.
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

# R^2 on both splits (the regressor's .score() is documented to return R^2).
train_score = final_model.score(X_train, y_train)
test_score = r2_score(y_test, y_pred)
print('Training Score: ', train_score)
print('Testing Score: ', test_score)


Training Score:  0.7948775072754012
Testing Score:  0.664275022345368


In [55]:
# Persist the trained model to disk so it can be reloaded without retraining.
MODEL_PATH = 'instagram_engagement_model.pkl'
joblib.dump(final_model, MODEL_PATH)

# Offer the file as a browser download when running on Google Colab.
# The google.colab package only exists in that environment, so skip the
# download elsewhere instead of failing after the model has been saved.
try:
    from google.colab import files
except ImportError:
    print('google.colab is not available; model saved locally as', MODEL_PATH)
else:
    files.download(MODEL_PATH)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>