In [1]:
data_url = 'https://towardsdatascience.com/predict-the-number-of-likes-on-instagram-a7ec5c020203'

In [2]:
# In this section:
    
# prepare NLP analysis
# prepare TF-IDF for important words

In [3]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [6]:
from nltk.corpus import stopwords

In [7]:
pd.set_option('display.max_colwidth', 150)

# expand column width for NLP analysis

In [8]:
iorig = pd.read_csv('insta_edit.csv',index_col=0)

print(iorig.shape)

iorig.head()

(29955, 14)


Unnamed: 0,USERNAME,FOLLOWERS,FOLLOWING,LIKES,COMMENTS,TEXT,NUMBER OF TAGS,LIST OF TAGS,DATE,PHOTOVIDEO,USERS IN PHOTO,LINK,LOCATION,WEEKDAYMS
0,georgiou82,1134619,915,18560,95,Every day is a #newday and every experience is a #newera,2,#newera#newday,2017-02-11 11:05:10,1,1,https://www.instagram.com/p/BQXZpSQjiPj,"33.0,35.0",5
1,georgiou82,1134619,915,17710,86,#goodnight !!! The #weekend is ahead!!! Have a #goodone guys!!!!,3,#goodnight#goodone#weekend,2017-02-11 00:33:10,1,1,https://www.instagram.com/p/BQWRUV6j8b-,"33.3612275609,35.1713765464",5
2,georgiou82,1134619,915,11959,79,#tb #2007 #stous31dromous #megatv #greece #newyork #memories,7,#tb#memories#greece#stous31dromous#newyork#2007#megatv,2017-02-09 15:51:47,1,3,https://www.instagram.com/p/BQSw2wMj0Bw,"-74.0064,40.7142",3
3,georgiou82,1134619,915,13608,37,Cold weather brings people together.... #filming #mprousko4 with director @stamosts and DOP @kinolis2.8,2,#mprousko4#filming,2017-02-08 18:21:57,1,3,https://www.instagram.com/p/BQQdPrLDeRz,,2
4,georgiou82,1134619,915,15842,62,The youngest hearts might be the oldest souls.... #lifelessons #andreasgeorgiou✔️ #tb #peru,4,#peru#lifelessons#tb#andreasgeorgiou✔️,2017-02-08 13:03:14,1,0,https://www.instagram.com/p/BQP4xY7jAXb,"-71.9781,-13.5183",2


Pre-process text data

In [9]:
# create text data series

iedit = iorig['TEXT']

print(type(iedit))

iedit.head()

<class 'pandas.core.series.Series'>


0                                                   Every day is a #newday and every experience is a #newera
1                                           #goodnight !!! The #weekend is ahead!!! Have a #goodone guys!!!!
2                                               #tb #2007 #stous31dromous #megatv #greece #newyork #memories
3    Cold weather brings people together.... #filming #mprousko4 with director @stamosts and DOP @kinolis2.8
4                The youngest hearts might be the oldest souls.... #lifelessons #andreasgeorgiou✔️ #tb #peru
Name: TEXT, dtype: object

In [10]:
# create copy of text for comparison

icopy = iorig['TEXT']

In [11]:
# lowercase

iedit = iedit.apply(lambda x:
                    " ".join(x.lower()
                    for x in str(x).split()))
iedit.head()

0                                                   every day is a #newday and every experience is a #newera
1                                           #goodnight !!! the #weekend is ahead!!! have a #goodone guys!!!!
2                                               #tb #2007 #stous31dromous #megatv #greece #newyork #memories
3    cold weather brings people together.... #filming #mprousko4 with director @stamosts and dop @kinolis2.8
4                the youngest hearts might be the oldest souls.... #lifelessons #andreasgeorgiou✔️ #tb #peru
Name: TEXT, dtype: object

In [12]:
# remove special characters
# will revisit hashtags; for now merging hashtag and reg word

iedit = iedit.str.replace('[^\w\s]', ' ')

iedit.head()

0                                                   every day is a  newday and every experience is a  newera
1                                            goodnight     the  weekend is ahead    have a  goodone guys    
2                                                tb  2007  stous31dromous  megatv  greece  newyork  memories
3    cold weather brings people together      filming  mprousko4 with director  stamosts and dop  kinolis2 8
4                the youngest hearts might be the oldest souls      lifelessons  andreasgeorgiou    tb  peru
Name: TEXT, dtype: object

In [13]:
# remove stopwords

stop = stopwords.words('english')

iedit = iedit.apply(lambda x:
                    " ".join(x for x in str(x).split()
                    if x not in stop))
iedit.head()

0                                                  every day newday every experience newera
1                                                      goodnight weekend ahead goodone guys
2                                     tb 2007 stous31dromous megatv greece newyork memories
3    cold weather brings people together filming mprousko4 director stamosts dop kinolis2 8
4                    youngest hearts might oldest souls lifelessons andreasgeorgiou tb peru
Name: TEXT, dtype: object

In [14]:
# visualize frequent words

print( 'Unique words: ' + str(pd.Series(' '.join(iedit).split()).value_counts().count()))

freq_words = pd.Series(' '.join(iedit).split()).value_counts()[:10]

freq_words

# we see a huge number of Greek words; to remove

Unique words: 75692


και       5086
το        4428
να        3554
με        3036
στο       2817
για       2671
την       2646
greece    2517
love      2358
η         2207
dtype: int64

In [15]:
# remove Greek stopwords

gstop = stopwords.words('greek')

iedit = iedit.apply(lambda x:
                    " ".join(x for x in str(x).split()
                    if x not in gstop))
iedit.head()

0                                                  every day newday every experience newera
1                                                      goodnight weekend ahead goodone guys
2                                     tb 2007 stous31dromous megatv greece newyork memories
3    cold weather brings people together filming mprousko4 director stamosts dop kinolis2 8
4                    youngest hearts might oldest souls lifelessons andreasgeorgiou tb peru
Name: TEXT, dtype: object

In [16]:
infreq_words = pd.Series(' '.join(iedit).split()).value_counts()[-10:]

infreq_words

alexanderwangny    1
ermis_soumas       1
leontios           1
филиппины          1
διάπλατα           1
μακροχρόνια        1
happy2gether       1
cobrarà            1
zetapsisgrho       1
σπιτοπαρεαμεθεα    1
dtype: int64

In [17]:
rare_words = pd.Series(' '.join(iedit).split()).value_counts()

rare_words = rare_words[rare_words <= 2]

print( 'Unique rare words: ' + str(rare_words.count()))

Unique rare words: 54380


In [18]:
# remove rare words (2 occurances or less)

iedit = iedit.apply(lambda x:
                    " ".join(x for x in str(x).split()
                    if x not in rare_words))
iedit.head()

0                                                every day newday every experience
1                                                     goodnight weekend ahead guys
2                                                  tb 2007 greece newyork memories
3    cold weather brings people together filming mprousko4 director dop kinolis2 8
4                                       hearts might souls andreasgeorgiou tb peru
Name: TEXT, dtype: object

In [19]:
print( 'Remaining unique words: ' + str(pd.Series(' '.join(iedit).split()).value_counts().count()))

Remaining unique words: 21236


In [36]:
int_words = pd.Series(' '.join(iedit).split()).value_counts()

int_words = int_words[inte_words.str.contains]

# print( 'Unique rare words: ' + str(rare_words.count()))

In [37]:
int_words.head()

greece     2517
love       2358
της        1710
μου        1621
fashion    1429
dtype: int64

In [61]:
# find all words that contain integers

has_int = []

for word,count in int_words.iteritems():
    if any(char.isdigit() for char in word):
        has_int.append(word)
        
print(len(has_int))
print(has_int[:10])

794
['00', '2017', '1', '2', '4', 'like4like', '3', '5', '90', 'follow4follow']


In [62]:
# remove words that contain integers

iedit = iedit.apply(lambda x:
                    " ".join(x for x in str(x).split()
                    if x not in has_int))
iedit.head()

0                           every day newday every experience
1                                goodnight weekend ahead guys
2                                  tb greece newyork memories
3    cold weather brings people together filming director dop
4                  hearts might souls andreasgeorgiou tb peru
Name: TEXT, dtype: object

In [63]:
print( 'Remaining unique words: ' + str(pd.Series(' '.join(iedit).split()).value_counts().count()))

Remaining unique words: 20442


In [70]:
iorig['NEWTEXT'] = iedit

iorig.head()

Unnamed: 0,USERNAME,FOLLOWERS,FOLLOWING,LIKES,COMMENTS,TEXT,NUMBER OF TAGS,LIST OF TAGS,DATE,PHOTOVIDEO,USERS IN PHOTO,LINK,LOCATION,WEEKDAYMS,NEWTEXT
0,georgiou82,1134619,915,18560,95,Every day is a #newday and every experience is a #newera,2,#newera#newday,2017-02-11 11:05:10,1,1,https://www.instagram.com/p/BQXZpSQjiPj,"33.0,35.0",5,every day newday every experience
1,georgiou82,1134619,915,17710,86,#goodnight !!! The #weekend is ahead!!! Have a #goodone guys!!!!,3,#goodnight#goodone#weekend,2017-02-11 00:33:10,1,1,https://www.instagram.com/p/BQWRUV6j8b-,"33.3612275609,35.1713765464",5,goodnight weekend ahead guys
2,georgiou82,1134619,915,11959,79,#tb #2007 #stous31dromous #megatv #greece #newyork #memories,7,#tb#memories#greece#stous31dromous#newyork#2007#megatv,2017-02-09 15:51:47,1,3,https://www.instagram.com/p/BQSw2wMj0Bw,"-74.0064,40.7142",3,tb greece newyork memories
3,georgiou82,1134619,915,13608,37,Cold weather brings people together.... #filming #mprousko4 with director @stamosts and DOP @kinolis2.8,2,#mprousko4#filming,2017-02-08 18:21:57,1,3,https://www.instagram.com/p/BQQdPrLDeRz,,2,cold weather brings people together filming director dop
4,georgiou82,1134619,915,15842,62,The youngest hearts might be the oldest souls.... #lifelessons #andreasgeorgiou✔️ #tb #peru,4,#peru#lifelessons#tb#andreasgeorgiou✔️,2017-02-08 13:03:14,1,0,https://www.instagram.com/p/BQP4xY7jAXb,"-71.9781,-13.5183",2,hearts might souls andreasgeorgiou tb peru


TF-IDF

In [64]:
# Manual TF-IDF

tf_tx = iedit.apply(lambda x: 
        pd.value_counts(x.split(" "))
        ).sum(axis=0).reset_index()

tf_tx.columns = ['words','tf']

tf_tx.head(10)

Unnamed: 0,words,tf
0,,2320.0
1,_,100.0
2,__,4.0
3,___,9.0
4,____,4.0
5,_____,5.0
6,_______,3.0
7,________,18.0
8,_________,6.0
9,__________,24.0


In [71]:
for i,word in enumerate(tf_tx['words']):
    tf_tx.loc[i, 'idf'] = np.log(iorig.shape[0]/
                        (len(iorig[iorig['NEWTEXT'].str.contains(word)])))
tf_tx.head(10)

Unnamed: 0,words,tf,idf
0,,2320.0,0.0
1,_,100.0,1.80883
2,__,4.0,3.72128
3,___,9.0,3.927329
4,____,4.0,3.934132
5,_____,5.0,3.939264
6,_______,3.0,3.944423
7,________,18.0,3.947878
8,_________,6.0,3.960062
9,__________,24.0,3.968857


In [72]:
tf_tx['tfidf'] = tf_tx['tf'] * tf_tx['idf']

tf_tx.head()

Unnamed: 0,words,tf,idf,tfidf
0,,2320.0,0.0,0.0
1,_,100.0,1.80883,180.882973
2,__,4.0,3.72128,14.88512
3,___,9.0,3.927329,35.345961
4,____,4.0,3.934132,15.736527


In [73]:
tf_sort = tf_tx.sort_values(by='tfidf',ascending=False)

tf_sort.head(20)

Unnamed: 0,words,tf,idf,tfidf
5051,greece,2517.0,2.265395,5701.999528
19242,της,1710.0,3.163044,5408.805846
17423,μου,1621.0,3.10852,5038.911396
17236,μας,1428.0,3.281025,4685.303308
528,anime,1201.0,3.842863,4615.27874
7344,love,2358.0,1.944109,4584.209182
1801,burakozcivit,1177.0,3.718525,4376.703992
14839,από,1219.0,3.222387,3928.090046
18695,σας,1118.0,3.502837,3916.171782
15695,είναι,1005.0,3.653299,3671.565509


In [74]:
# save TF-IDF data to CSV due to long process time

try:
    tf_tx.to_csv('insta_tfidf.csv',mode='x')
    print('File saved')
    
except:
    print('File exists')

File saved


In [None]:
# will attempt NLP ML in part 04