 Imagine you have a dataset where you have different Instagram features like username , Caption , Hashtag , Followers , Time_Since_posted , and likes , now your task is to predict the number of likes and Time Since posted and the rest of the features are your input features. Now you have to build a model which can predict the number of likes and Time Since posted.

In [1]:
import pandas as pd
import warnings
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pickle
import numpy as np
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("../data/instagram_reach.csv")
df.shape

(100, 8)

In [3]:
df['text'] = df['Caption']+df['Hashtags']
df['text'] = df['text'].astype(str)
df['text']

0     Who are #DataScientist and what do they do? >>...
1     We all know where it’s going. We just have to ...
2     Alexander Barinov: 4 years as CFO in multinati...
3     sfad#iot #cre#workplace #CDO #bigdata #technol...
4     Ever missed a call while your phone was chargi...
                            ...                        
95    328 S. Wetherly Drive, Beverly Hills, CA 90212...
96    Credit @tristankappel To find more dvlp follow...
97    We are coming up with the Best 21 Books that w...
98    We’re only paid to move dirt once. It’s not ju...
99    Obtén tu tienda en línea ahora.#marketing #pro...
Name: text, Length: 100, dtype: object

In [4]:
lemmatizer = WordNetLemmatizer()
corpus = []

for i in range(len(df)):
    text = re.sub('[^a-zA-Z]', '', df['text'][i])
    text = text.lower()
    text = nltk.word_tokenize(text)
    text = [lemmatizer.lemmatize(word) for word in text if not word in set(
        stopwords.words('english'))]
    text = ' '.join(text)
    corpus.append(text)

In [5]:
cv = CountVectorizer(max_features=7000)
X_BOW = cv.fit_transform(corpus)

In [6]:
from scipy.sparse import csr_matrix
# Create a DataFrame from the sparse matrix
df_sparse = pd.DataFrame.sparse.from_spmatrix(X_BOW)

In [7]:
# Append the sparse DataFrame to the existing DataFrame
df_combined = pd.concat([df, df_sparse], axis=1, ignore_index=True)
df_combined

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
0,0,1,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11 hours,139,Who are #DataScientist and what do they do? >>...,0,...,0,0,0,0,0,0,0,0,1,0
1,1,2,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2 hours,23,We all know where it’s going. We just have to ...,0,...,0,0,0,0,1,0,0,0,0,0
2,2,3,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2 hours,25,Alexander Barinov: 4 years as CFO in multinati...,0,...,0,0,0,0,0,0,0,0,0,0
3,3,4,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3 hours,49,sfad#iot #cre#workplace #CDO #bigdata #technol...,0,...,0,0,0,0,0,0,0,0,0,0
4,4,5,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3 hours,30,Ever missed a call while your phone was chargi...,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,8,19,michaelgarza__,"328 S. Wetherly Drive, Beverly Hills, CA 90212...",614,#beverlyhills #realestate#losangelesrealestate...,3 hours,31,"328 S. Wetherly Drive, Beverly Hills, CA 90212...",0,...,0,0,0,0,0,0,0,0,0,0
96,9,21,dvlp_search,Credit @tristankappel To find more dvlp follow...,450,#workspace #work #developer#development #devel...,3 hours,42,Credit @tristankappel To find more dvlp follow...,0,...,0,0,0,0,0,0,0,0,0,0
97,10,22,ecom.space,We are coming up with the Best 21 Books that w...,182,#books #book #motivation #inspiration #life#bo...,3 hours,10,We are coming up with the Best 21 Books that w...,0,...,0,0,0,0,0,1,0,0,0,0
98,11,24,lb3enterprises,We’re only paid to move dirt once. It’s not ju...,2039,#heavyequipment #underconstruction#dozer #real...,3 hours,222,We’re only paid to move dirt once. It’s not ju...,0,...,0,0,0,0,0,0,0,1,0,0


In [14]:
X = df_combined.drop([0,1,2,3,5,6,7,8],axis=1)
X

Unnamed: 0,4,9,10,11,12,13,14,15,16,17,...,94,95,96,97,98,99,100,101,102,103
0,1600,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,880,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,255,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,340,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,304,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,614,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,450,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,182,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
98,2039,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [15]:
y1 = df_combined[7]

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.20, random_state=1)

In [17]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_predict
def train_model(model):
    m = model[1]
    y_train_pred = cross_val_predict(model[1], X_train, y_train, cv=5)
    mae = mean_absolute_error(y_train,y_train_pred)
    mse = mean_squared_error(y_train,y_train_pred)
    rmse = np.sqrt(mse)
    rmsle = np.log(rmse)
    r2 = r2_score(y_train,y_train_pred)
    print(f'{model[0]} MAE: {mae}')
    print(f'{model[0]} MSE: {mse}')
    print(f'{model[0]} RMSE: {rmse}')
    print(f'{model[0]} RMSLE: {rmsle}')
    print(f'{model[0]} R2 score: {r2}')

In [18]:
from sklearn.linear_model import LinearRegression
train_model(('Linear Regression', LinearRegression()))

Linear Regression MAE: 35.14715329105792
Linear Regression MSE: 3386.3203992113035
Linear Regression RMSE: 58.19209911329289
Linear Regression RMSLE: 4.063749591454842
Linear Regression R2 score: -0.008205527995839068


In [19]:
from sklearn.linear_model import Ridge
train_model(('Ridge Regression', Ridge()))

Ridge Regression MAE: 34.320586014097316
Ridge Regression MSE: 3302.034772192141
Ridge Regression RMSE: 57.46333415485166
Ridge Regression RMSLE: 4.05114707753517
Ridge Regression R2 score: 0.016888741025812504


In [20]:
from sklearn.neural_network import MLPRegressor
train_model(('MLPR Regression', MLPRegressor(random_state=1, max_iter=500)))

MLPR Regression MAE: 45.735855135539076
MLPR Regression MSE: 5160.21311850572
MLPR Regression RMSE: 71.83462339642159
MLPR Regression RMSLE: 4.274366579829348
MLPR Regression R2 score: -0.5363446981938933


In [21]:
from sklearn.linear_model import Lasso
train_model(('Lasso Regression', Lasso()))

Lasso Regression MAE: 31.333788707147427
Lasso Regression MSE: 3337.3206803127373
Lasso Regression RMSE: 57.76954803625122
Lasso Regression RMSLE: 4.056461786259908
Lasso Regression R2 score: 0.00638310557683841


In [22]:
from sklearn.tree import DecisionTreeRegressor
train_model(('Decision Tree Regression', DecisionTreeRegressor()))

Decision Tree Regression MAE: 30.5375
Decision Tree Regression MSE: 3902.0375
Decision Tree Regression RMSE: 62.46629090957778
Decision Tree Regression RMSLE: 4.1346270657965265
Decision Tree Regression R2 score: -0.1617494253831775


In [23]:
from sklearn.ensemble import RandomForestRegressor
train_model(('Random Forest Regression', RandomForestRegressor()))

Random Forest Regression MAE: 29.194624999999995
Random Forest Regression MSE: 3744.59291375
Random Forest Regression RMSE: 61.19307896935731
Random Forest Regression RMSLE: 4.114034094387275
Random Forest Regression R2 score: -0.11487361816563291


In [24]:
from sklearn.neighbors import KNeighborsRegressor
train_model(('KNN Regression', KNeighborsRegressor()))

KNN Regression MAE: 35.6
KNN Regression MSE: 4024.925
KNN Regression RMSE: 63.442296616689404
KNN Regression RMSLE: 4.150130778069342
KNN Regression R2 score: -0.19833658850290026


In [25]:
from sklearn.svm import SVR
train_model(('SVM Regression', SVR()))

SVM Regression MAE: 29.51655270252935
SVM Regression MSE: 3706.765065688812
SVM Regression RMSE: 60.88320840501765
SVM Regression RMSLE: 4.108957412641446
SVM Regression R2 score: -0.10361117367385964


In [26]:
from sklearn.gaussian_process import GaussianProcessRegressor
train_model(('Gaussian Regression', GaussianProcessRegressor()))

Gaussian Regression MAE: 46.79419760757668
Gaussian Regression MSE: 5585.318362841141
Gaussian Regression RMSE: 74.73498754158685
Gaussian Regression RMSLE: 4.313948356536578
Gaussian Regression R2 score: -0.6629108250786426


In [27]:
from sklearn.model_selection import GridSearchCV

# Define the Ridge model
ridge = Ridge()

# Define the hyperparameter grid
param_grid = {
    'alpha': [0.1, 1.0, 10.0],    # Different values of alpha
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sag']  # Different solvers
}

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:")
print(best_params)

Best Hyperparameters:
{'alpha': 1.0, 'solver': 'auto'}


In [28]:
# Evaluate the best model
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test,y_pred)
print(f"R2 score: {r2}")

R2 score: 0.2451529300393187


In [29]:
y2 = df_combined[6].replace(r'hours', '', regex=True).astype(int)

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y2, test_size=0.20, random_state=1)

In [31]:
from sklearn.linear_model import LinearRegression
train_model(('Linear Regression', LinearRegression()))

Linear Regression MAE: 1.8308008904101627
Linear Regression MSE: 11.819952927679003
Linear Regression RMSE: 3.4380158416852886
Linear Regression RMSLE: 1.234894514770079
Linear Regression R2 score: 0.040928815645498884


In [32]:
from sklearn.linear_model import Ridge
train_model(('Ridge Regression', Ridge()))

Ridge Regression MAE: 1.8453811050283815
Ridge Regression MSE: 11.633373826746631
Ridge Regression RMSE: 3.410773200719542
Ridge Regression RMSLE: 1.2269390106716898
Ridge Regression R2 score: 0.0560678471121957


In [33]:
from sklearn.neural_network import MLPRegressor
train_model(('MLPR Regression', MLPRegressor(random_state=1, max_iter=500)))

MLPR Regression MAE: 17.097004562329516
MLPR Regression MSE: 692.823495584906
MLPR Regression RMSE: 26.321540524538186
MLPR Regression RMSLE: 3.2703876353118297
MLPR Regression R2 score: -55.215710377597745


In [34]:
from sklearn.linear_model import Lasso
train_model(('Lasso Regression', Lasso()))

Lasso Regression MAE: 1.891239561534407
Lasso Regression MSE: 11.81445812035196
Lasso Regression RMSE: 3.4372166240072737
Lasso Regression RMSLE: 1.2346620229421725
Lasso Regression R2 score: 0.04137466440675808


In [35]:
from sklearn.tree import DecisionTreeRegressor
train_model(('Decision Tree Regression', DecisionTreeRegressor()))

Decision Tree Regression MAE: 1.4875
Decision Tree Regression MSE: 14.0125
Decision Tree Regression RMSE: 3.7433273968489584
Decision Tree Regression RMSLE: 1.3199748941991392
Decision Tree Regression R2 score: -0.13697449160707964


In [36]:
from sklearn.ensemble import RandomForestRegressor
train_model(('Random Forest Regression', RandomForestRegressor()))

Random Forest Regression MAE: 1.4757500000000001
Random Forest Regression MSE: 13.773275000000002
Random Forest Regression RMSE: 3.7112363169165072
Random Forest Regression RMSLE: 1.311365060168593
Random Forest Regression R2 score: -0.11756377098230164


In [37]:
from sklearn.neighbors import KNeighborsRegressor
train_model(('KNN Regression', KNeighborsRegressor()))

KNN Regression MAE: 1.7899999999999998
KNN Regression MSE: 12.383
KNN Regression RMSE: 3.5189487066452103
KNN Regression RMSLE: 1.2581622821164806
KNN Regression R2 score: -0.00475683351082723


In [38]:
from sklearn.svm import SVR
train_model(('SVM Regression', SVR()))

SVM Regression MAE: 1.5052954424545697
SVM Regression MSE: 13.473140339005175
SVM Regression RMSE: 3.67057765740015
SVM Regression RMSLE: 1.3003490495169774
SVM Regression R2 score: -0.09321083941418351


In [39]:
from sklearn.gaussian_process import GaussianProcessRegressor
train_model(('Gaussian Regression', GaussianProcessRegressor()))

Gaussian Regression MAE: 3.4230710302798144
Gaussian Regression MSE: 24.21163279144648
Gaussian Regression RMSE: 4.920531759012077
Gaussian Regression RMSLE: 1.593416605763891
Gaussian Regression R2 score: -0.9645323021610819


In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

# Define the Ridge model
ridge = Ridge()

# Define the hyperparameter grid
param_grid = {
    'alpha': [0.1, 1.0, 10.0],    # Different values of alpha
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sag']  # Different solvers
}

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:")
print(best_params)

Best Hyperparameters:
{'alpha': 1.0, 'solver': 'svd'}


In [41]:
# Evaluate the best model
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test,y_pred)
print(f"R2 score: {r2}")

R2 score: 0.04095064645047264
