In [1]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Load the complete raw dataset from disk and preview its first rows.
RAW_DATA_PATH = '../raw_data/all_data.csv'
df = pd.read_csv(RAW_DATA_PATH)
df.head()

Unnamed: 0,Username,Follower count:,Post ID,Media Type:,Caption,Likes,Comments,Timestamp,Tags
0,newbalance,7074908,3.11266e+18,2,Together in sport. 2023. #WeGotNow,26534,333,2023-05-28 12:58:40+00:00,5
1,newbalance,7074908,3.08876e+18,2,IU. Seoul. 2023. #WeGotNow,53742,1100,2023-04-25 13:14:48+00:00,1
2,newbalance,7074908,3.05245e+18,2,"Same game, just played his way. Shohei Ohtani....",357969,1678,2023-03-06 11:03:46+00:00,2
3,newbalance,7074908,3.15546e+18,8,Introducing the debut collaborative project fr...,28796,173,2023-07-26 14:05:00+00:00,2
4,newbalance,7074908,3.14892e+18,8,Ready to take on the world.,5289,68,2023-07-17 13:32:36+00:00,6


In [3]:
# List every column label of the loaded frame.
df.columns

Index(['Username', 'Follower count:', 'Post ID', 'Media Type:', 'Caption',
       'Likes', 'Comments', 'Timestamp', 'Tags'],
      dtype='object')

In [4]:
# Per-column missing-value summary: absolute count of nulls and the share of
# rows (in percent) that are null, displayed sorted by count.
is_missing = df.isnull()
missing = pd.DataFrame({
    'count': is_missing.sum(),
    '%': 100 * is_missing.mean(),
})
missing.sort_values(by='count')

Unnamed: 0,count,%
Username,0,0.0
Follower count:,0,0.0
Post ID,0,0.0
Media Type:,0,0.0
Caption,0,0.0
Likes,0,0.0
Comments,0,0.0
Timestamp,0,0.0
Tags,0,0.0


Process Captions

In [5]:
# Caption-cleaning helpers.

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every clean_text call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# language name -> frozenset of stopwords, filled lazily on first use so the
# NLTK corpus is read once rather than once per caption.
_STOP_WORDS_CACHE = {}


def _stop_words_for(language):
    """Return the NLTK stopword set for ``language``, loading it only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def clean_text(text):
    """Normalise a caption into a space-separated string of content words.

    Steps, in order: lowercase the text, delete the characters listed in
    ``string.punctuation``, tokenize with NLTK's ``word_tokenize`` and drop
    tokens found in NLTK's English stopword list.

    Parameters
    ----------
    text : str
        Raw caption. Any non-string value (for example ``None`` or the float
        ``NaN`` pandas uses for a missing caption) is treated as "no caption".

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``''`` when ``text`` is
        not a string.

    Notes
    -----
    Needs the NLTK stopwords corpus and tokenizer data to be installed
    (see ``nltk.download``).
    """
    # Missing captions are not strings; return an empty result rather than
    # failing on ``.lower()``.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    # NOTE(review): this happens before stopword filtering, so contractions
    # lose their apostrophe ("don't" -> "dont") and may no longer match the
    # stopword list - confirm whether that is intended.
    text = text.translate(_PUNCTUATION_TABLE)

    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = _stop_words_for('english')
    return ' '.join(word for word in tokens if word not in stop_words)


In [6]:
# Run every raw caption through clean_text, keeping the original row order.
clean_captions = [clean_text(raw_caption) for raw_caption in df['Caption']]


In [7]:
# Attach the cleaned captions to the frame as a 'clean_captions' column.
df = df.assign(clean_captions=clean_captions)

Considerations:
- should the captions be processed for synonyms?
- should numbers be removed?
- should an array of the emojis used in each caption be added and encoded?
- a small handful of captions are in a different language... how should they be handled?
    - ignore them altogether
    - translate them manually
    - try to translate them with Google Translate?

Process Timestamps

In [8]:
# Check the dtype of the Timestamp column as loaded from the CSV, before any
# conversion.
df['Timestamp'].dtype

dtype('O')

In [9]:
# Convert the Timestamp strings to timezone-aware datetimes.
# utc=True normalises every value to UTC, so the column gets a single
# datetime64[ns, UTC] dtype even if some rows carry a different offset; the
# .dt accessors used afterwards rely on that dtype.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], utc=True)
df['Timestamp'].dtype

datetime64[ns, UTC]

In [10]:
# Derive calendar features from each post's timestamp; every new column is
# named after the datetime accessor it is read from.
timestamp_parts = df['Timestamp'].dt
for part in ('hour', 'dayofweek', 'month'):
    df[part] = getattr(timestamp_parts, part)

In [11]:
# Preview the frame with the added clean_captions, hour, dayofweek and month columns.
df.head()

Unnamed: 0,Username,Follower count:,Post ID,Media Type:,Caption,Likes,Comments,Timestamp,Tags,clean_captions,hour,dayofweek,month
0,newbalance,7074908,3.11266e+18,2,Together in sport. 2023. #WeGotNow,26534,333,2023-05-28 12:58:40+00:00,5,together sport 2023 wegotnow,12,6,5
1,newbalance,7074908,3.08876e+18,2,IU. Seoul. 2023. #WeGotNow,53742,1100,2023-04-25 13:14:48+00:00,1,iu seoul 2023 wegotnow,13,1,4
2,newbalance,7074908,3.05245e+18,2,"Same game, just played his way. Shohei Ohtani....",357969,1678,2023-03-06 11:03:46+00:00,2,game played way shohei ohtani 2023 wegotnow 同じ...,11,0,3
3,newbalance,7074908,3.15546e+18,8,Introducing the debut collaborative project fr...,28796,173,2023-07-26 14:05:00+00:00,2,introducing debut collaborative project actres...,14,2,7
4,newbalance,7074908,3.14892e+18,8,Ready to take on the world.,5289,68,2023-07-17 13:32:36+00:00,6,ready take world,13,0,7


In [12]:
# save the data to a new csv file
# NOTE(review): this import sits mid-notebook; consider moving it to the
# imports cell at the top so all dependencies are visible in one place.

from library.sb_utils import save_file

# Output directory for the processed dataset (relative to the notebook).
datapath = '../data'
# save_file is a project helper not shown here; its printed message indicates it
# writes to "<datapath>/cleaned_insta.csv" - presumably as CSV; confirm.
save_file(df, 'cleaned_insta.csv', datapath)

Writing file.  "../data/cleaned_insta.csv"
