#Model

In [116]:
# Loading the required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.feature_selection import SelectKBest, chi2 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
import nltk
nltk.download('stopwords')
# %matplotlib inline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import datetime
import time


[nltk_data] Downloading package stopwords to /Users/omkar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [117]:
# Loading the dataset formed by scraping Instagram throgh Instaloader
df = pd.read_excel('newdata1.xlsx')

In [118]:
# Converting the caption to string type
df['caption'] = df['caption'].astype(str)

In [119]:
df.shape

(1000, 12)

In [120]:
df.columns

Index(['username', 'image_url', 'following', 'followers', 'datetime',
       'type_media', 'likes', 'comments', 'caption', 'caption_hashtags',
       'comments_text', 'category'],
      dtype='object')

In [121]:
# Dropping the rows with null values
df = df[df.category.isnull() == False]

In [122]:
df.isnull().sum()

username            0
image_url           0
following           0
followers           0
datetime            0
type_media          0
likes               0
comments            0
caption             0
caption_hashtags    0
comments_text       0
category            0
dtype: int64

In [123]:
# Descriptive statistics of the dataset
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
following,1000.0,1094.335,1669.022987,0.0,109.25,397.0,1087.75,7504.0
followers,1000.0,10122.6,85832.062212,4.0,307.0,957.0,3822.25,2351783.0
likes,1000.0,328.383,1108.675506,1.0,23.75,54.0,198.25,22090.0
comments,1000.0,10.835,32.36766,2.0,2.0,4.0,8.25,724.0
category,1000.0,0.349,0.476892,0.0,0.0,0.0,1.0,1.0


In [124]:
# Value counts of positives(1) and negatives(0)
df['category'].value_counts()

0    651
1    349
Name: category, dtype: int64

In [125]:
# Cleaning the text data by stripping blank spaces, removing unnecessary tokens.
# Text pre-processing : Removing stopwords and stemming

stemmer = PorterStemmer()
words = stopwords.words("english")
df['caption'] = df['caption'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z0-9]", " ", x).split() if i not in words]).lower())

# Using TFIDF Vectorizer to form numerical features
vectorizer = TfidfVectorizer(min_df=3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))

In [126]:
# Selecting top 1000 best features using SelectKBest and storing it in an array
pipe = Pipeline([('vect', vectorizer), ('chi',  SelectKBest(chi2, k=1000))])
text_features = pipe.fit_transform(df['caption'], df['category'])
text_features = text_features.toarray()

In [127]:
text_features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [128]:
d1 = datetime.datetime.now()
d1_ts = time.mktime(d1.timetuple())
df['datetime_diff'] = (df['datetime'].apply(lambda x : time.mktime(x.timetuple())) - d1_ts)/60
df['datetime_diff']=(df['datetime_diff']-df['datetime_diff'].min())/(df['datetime_diff'].max()-df['datetime_diff'].min())
df['datetime_diff']

0      1.000000
1      0.999706
2      0.999691
3      0.999657
4      0.999576
5      0.999463
6      0.999387
7      0.999370
8      0.999365
9      0.999187
10     0.999114
11     0.999079
12     0.999077
13     0.999073
14     0.998886
15     0.998881
16     0.998876
17     0.998875
18     0.998829
19     0.998797
20     0.998768
21     0.998758
22     0.998729
23     0.998717
24     0.998695
25     0.998623
26     0.998584
27     0.998525
28     0.998514
29     0.998511
         ...   
970    0.635304
971    0.635294
972    0.635291
973    0.635245
974    0.635220
975    0.635176
976    0.635148
977    0.635147
978    0.635139
979    0.635074
980    0.635024
981    0.635019
982    0.635006
983    0.634979
984    0.634961
985    0.634950
986    0.634945
987    0.634937
988    0.634935
989    0.634934
990    0.634926
991    0.634910
992    0.634896
993    0.634742
994    0.634707
995    0.634614
996    0.634573
997    0.634544
998    0.634494
999    0.634484
Name: datetime_diff, Len

In [129]:
df1 = df[['following','followers', 'likes', 'comments', "datetime_diff"]]
#df1

In [137]:
# Categorising the target variable
df['category'] = pd.Categorical(df['category'])

In [153]:
# Creating training and test datasets
#X = text_features
X = df1
y = df['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [154]:
# Fitting the model
# Linear SVC is used since the data is sparsed and Support vectors work well on such kind of dataset
model = LinearSVC(class_weight='balanced', max_iter=5000)
model.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=5000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [155]:
# Performing prediction on test dataset
preds = model.predict(X_test)
print('Final prediction score: [%.8f]' % accuracy_score(y_test, preds))
print('Final prediction f1 score: [%.8f]' % f1_score(y_test, preds, average='weighted'))

Final prediction score: [0.97000000]
Final prediction f1 score: [0.96983631]


In [156]:
# Printing classification report and confusion matrix
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       143
           1       0.96      0.93      0.95        57

   micro avg       0.97      0.97      0.97       200
   macro avg       0.97      0.96      0.96       200
weighted avg       0.97      0.97      0.97       200

[[141   2]
 [  4  53]]


In [157]:
model = LogisticRegression()
model.fit(X_train, y_train)

# Performing prediction on test dataset
preds = model.predict(X_test)
print('Final prediction score: [%.8f]' % accuracy_score(y_test, preds))
print('Final prediction f1 score: [%.8f]' % f1_score(y_test, preds, average='weighted'))

Final prediction score: [0.95500000]
Final prediction f1 score: [0.95533841]


In [158]:
model = clf = LogisticRegressionCV(cv=5, random_state=0, multi_class='multinomial')
model.fit(X_train, y_train)

# Performing prediction on test dataset
preds = model.predict(X_test)
print('Final prediction score: [%.8f]' % accuracy_score(y_test, preds))
print('Final prediction f1 score: [%.8f]' % f1_score(y_test, preds, average='weighted'))

Final prediction score: [0.98500000]
Final prediction f1 score: [0.98495976]


In [159]:
import xgboost as xgb

In [160]:
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [161]:
preds = xgb_model.predict(X_test)
print('Final prediction score: [%.8f]' % accuracy_score(y_test, preds))
print('Final prediction f1 score: [%.8f]' % f1_score(y_test, preds, average='weighted'))

Final prediction score: [1.00000000]
Final prediction f1 score: [1.00000000]
