# Data Pre-Processing & NLP

In [1]:
import numpy as np
import pandas as pd

from langdetect import detect
import pandas as pd
import re
import unicodedata

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Merging of all post requests

In [2]:
# Load dataset 1
df0 = pd.read_csv('ig_posts.csv')
df1 = pd.read_csv('newsupdate_posts.csv')
df3 = pd.read_csv('newsupdate_posts2.csv')
dftop1= pd.read_csv('newsupdate_posts_top.csv')

# Load dataset 2
df2 = pd.read_csv('fake_posts_complete.csv')

# Add a new column 'dataset' with labels in both DataFrames
df0['is_fake'] = '0'
df1['is_fake'] = '0'
df3['is_fake'] = '0'
dftop1['is_fake'] = '0'
df2['is_fake'] = '1'


# Merge the datasets
merged_df = pd.concat([df0,df1, df3,dftop1,df2], axis=0)
merged_df=merged_df.reset_index()

## Problem 1: Many duplicates included in the dataset, leaves us with less than half of entries

In [3]:
merged_df.drop_duplicates(subset='caption', keep="first", inplace=True)

## Problem 2: The labels are not distributed equally

In [4]:
merged_df.groupby("is_fake").count()

Unnamed: 0_level_0,index,Unnamed: 0,id,permalink,comments_count,like_count,media_type,media_url,timestamp,caption
is_fake,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,18662,18662,18662,18662,18662,17592,18662,15218,18662,18661
1,3273,3273,3273,3273,3273,3124,3273,2391,3273,3273


# Split text from Hashtag

In [5]:

def split_caption_column(caption):
    actual_text_column = []
    hashtags_column = []

   
    for caption in caption:
        try:
            words = caption.split()

            actual_text = []
            hashtags = []

            for word in words:
                if word.startswith('#'):
                    hashtags.append(word)
                else:
                    actual_text.append(word)

            actual_text = ' '.join(actual_text)
            hashtags = ' '.join(hashtags)

            actual_text_column.append(actual_text)
            hashtags_column.append(hashtags)
        except Exception as e:
            print(caption)
            actual_text_column.append(None)
            hashtags_column.append(None)
            print(e)
    return actual_text_column, hashtags_column

actual_text_column, hashtags_column = split_caption_column(merged_df['caption'])

# Add the new columns to the DataFrame
merged_df['actual_text'] = actual_text_column
merged_df['hashtags'] = hashtags_column


nan
'float' object has no attribute 'split'


# Select captions containing english words

### Following cell needs 10 mins to process the entries. Use ewc_df.csv instead of running it.

In [6]:
merged_df.drop(merged_df[merged_df['actual_text'] == ""].index, inplace=True)
merged_df.drop(merged_df[merged_df['actual_text'] == None].index, inplace=True)

In [7]:
saved_df=merged_df

In [8]:
merged_df=merged_df.reset_index()

In [9]:
merged_df['is_english']=""
merged_df['english_caption']=""
for row in range(len(merged_df)):
    try:

        caption= merged_df['actual_text'][row]
        cap_str=str(caption)
    
            
        try:
            
            if detect(cap_str) == 'en':
                merged_df['is_english'][row]=1
                merged_df['english_caption'][row]=cap_str
            else:
                merged_df['is_english'][row]=0
                merged_df['english_caption'][row]=None
        except Exception as e:
            print("inner:", row,e)
            merged_df['is_english'][row]=0
            merged_df['english_caption'][row]=None
    except Exception as e:
        print(e)
        print("outer:"+cap_str)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['is_english'][row]=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['english_caption'][row]=None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['is_english'][row]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['english_caption'][row]=cap_str
A value is trying to be s

In [10]:
#Remove non-english rows
merged_df.drop(merged_df[merged_df['is_english'] == 0].index, inplace=True)
#reindexing because of dropped rows
merged_df=merged_df.reset_index(drop=True)

In [11]:
merged_df

Unnamed: 0.1,level_0,index,Unnamed: 0,id,permalink,comments_count,like_count,media_type,media_url,timestamp,caption,is_fake,actual_text,hashtags,is_english,english_caption
0,1,1,1,17993509183855559,https://www.instagram.com/p/CtWQW2WvQqA/,1.0,8.0,IMAGE,https://scontent-iad3-1.cdninstagram.com/v/t51...,2023-06-11T11:04:26+0000,Pacific Joint Space Facility lunarcor_ sister ...,0,Pacific Joint Space Facility lunarcor_ sister ...,#pacificjointspacefacility #johnmoody #competi...,1,Pacific Joint Space Facility lunarcor_ sister ...
1,5,5,5,18005295199826313,https://www.instagram.com/tv/CtWQRtJu02n/,0.0,0.0,VIDEO,https://video-iad3-2.cdninstagram.com/v/t42.17...,2023-06-11T11:03:46+0000,"#news Neil Oliver: ""I say they're lying"".",0,"Neil Oliver: ""I say they're lying"".",#news,1,"Neil Oliver: ""I say they're lying""."
2,9,9,9,18011517661655428,https://www.instagram.com/reel/CtWOR7gAalj/,0.0,23.0,VIDEO,https://scontent-iad3-1.cdninstagram.com/o1/v/...,2023-06-11T11:03:25+0000,What’s your thoughts on Chatgpt ?\n#chatgpt #n...,0,What’s your thoughts on Chatgpt ?,#chatgpt #news #india #m&m #new #thoughts,1,What’s your thoughts on Chatgpt ?
3,10,10,10,18009993862606500,https://www.instagram.com/reel/CtWQI5-BcDn/,7.0,164.0,VIDEO,https://scontent-iad3-1.cdninstagram.com/o1/v/...,2023-06-11T11:03:25+0000,Follow nerdrudransh for more fax 🤨\n\n#adipuru...,0,Follow nerdrudransh for more fax 🤨,#adipurush #movie #cinema #bollywood #actor #p...,1,Follow nerdrudransh for more fax 🤨
4,12,12,12,17982058046476888,https://www.instagram.com/p/CtWQNsYSZ6w/,0.0,0.0,CAROUSEL_ALBUM,,2023-06-11T11:03:11+0000,Follow more new update\n.\n.\n.\n.\n.\n.\n.\n#...,0,Follow more new update . . . . . . .,#currentaffair #gk #ssc #upsc #ssccgl #current...,1,Follow more new update . . . . . . .
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8520,195551,113695,3,18185951629272070,https://www.instagram.com/reel/Ct1lttSg3pT/,0.0,0.0,VIDEO,https://scontent-iad3-1.cdninstagram.com/o1/v/...,2023-06-23T15:08:46+0000,Official Answer… by mattwalshblog \n**********...,1,Official Answer… by mattwalshblog ************...,#WeAreBreitbart #TheEpochTimes #Republicans #D...,1,Official Answer… by mattwalshblog ************...
8521,195552,113696,4,17983389545190690,https://www.instagram.com/p/Ct1luLdLvct/,0.0,0.0,IMAGE,https://scontent-iad3-2.cdninstagram.com/v/t51...,2023-06-23T15:07:34+0000,Fake Fact: Sloths have a 5th leg that is only ...,1,Fake Fact: Sloths have a 5th leg that is only ...,#fact #factsdaily #dailyfacts #aiart #sloths #...,1,Fake Fact: Sloths have a 5th leg that is only ...
8522,195553,113697,5,17970893624266885,https://www.instagram.com/reel/Ct1lXfugsWg/,0.0,1.0,VIDEO,https://scontent-iad3-1.cdninstagram.com/o1/v/...,2023-06-23T15:07:21+0000,Propaganda is so effective it’s fascinating 🖤\...,1,Propaganda is so effective it’s fascinating 🖤,#propaganda #news #fakenews #joerogan #riseaga...,1,Propaganda is so effective it’s fascinating 🖤
8523,195559,113703,11,18010682851736514,https://www.instagram.com/p/Ct1kzAzLcSm/,0.0,0.0,IMAGE,https://scontent-iad3-1.cdninstagram.com/v/t51...,2023-06-23T14:59:30+0000,Biden sniffin’ a troon 👀\n\n******************...,1,Biden sniffin’ a troon 👀 *******************,#WeAreBreitbart #TheEpochTimes #Republicans #D...,1,Biden sniffin’ a troon 👀 *******************


# Remove symbols from other alphabets

In [12]:
for row in range(len(merged_df)):
    if row <= len(merged_df):
        english_caption=""
        caption_str=str(merged_df['english_caption'][row] )
        words= caption_str.split()
        for word in words:
            if re.match(r'^[a-zA-Z0-9,_]+$', word):
                english_caption += ' '+word

        merged_df['english_caption'][row] = english_caption

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['english_caption'][row] = english_caption


In [13]:
merged_df.dropna(subset=['english_caption'], inplace=True)
merged_df=merged_df.reset_index(drop=True)

# NLP

### Pre-processing and tokenizing

In [14]:
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [15]:
def preprocess_caption(caption):
    # Tokenize the caption into individual words
    tokens = word_tokenize(caption)
    
    # Convert tokens to lowercase
    tokens = [token.lower() for token in tokens]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join tokens back into a processed caption
    processed_caption = ' '.join(tokens)
    
    return processed_caption


In [16]:
merged_df['preprocessed_caption'] = ""
caption=""
for row in range(len(merged_df)):
    try:
        caption= str(merged_df['english_caption'][row])
        merged_df['preprocessed_caption'][row]=preprocess_caption(caption)
    except Exception as e:
        print(e, caption)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['preprocessed_caption'][row]=preprocess_caption(caption)


In [19]:
merged_df

Unnamed: 0.1,level_0,index,Unnamed: 0,id,permalink,comments_count,like_count,media_type,media_url,timestamp,caption,is_fake,actual_text,hashtags,is_english,english_caption,preprocessed_caption
0,1,1,1,17993509183855559,https://www.instagram.com/p/CtWQW2WvQqA/,1.0,8.0,IMAGE,https://scontent-iad3-1.cdninstagram.com/v/t51...,2023-06-11T11:04:26+0000,Pacific Joint Space Facility lunarcor_ sister ...,0,Pacific Joint Space Facility lunarcor_ sister ...,#pacificjointspacefacility #johnmoody #competi...,1,Pacific Joint Space Facility lunarcor_ sister...,pacific joint space facility lunarcor_ sister ...
1,5,5,5,18005295199826313,https://www.instagram.com/tv/CtWQRtJu02n/,0.0,0.0,VIDEO,https://video-iad3-2.cdninstagram.com/v/t42.17...,2023-06-11T11:03:46+0000,"#news Neil Oliver: ""I say they're lying"".",0,"Neil Oliver: ""I say they're lying"".",#news,1,Neil say,neil say
2,9,9,9,18011517661655428,https://www.instagram.com/reel/CtWOR7gAalj/,0.0,23.0,VIDEO,https://scontent-iad3-1.cdninstagram.com/o1/v/...,2023-06-11T11:03:25+0000,What’s your thoughts on Chatgpt ?\n#chatgpt #n...,0,What’s your thoughts on Chatgpt ?,#chatgpt #news #india #m&m #new #thoughts,1,your thoughts on Chatgpt,thought chatgpt
3,10,10,10,18009993862606500,https://www.instagram.com/reel/CtWQI5-BcDn/,7.0,164.0,VIDEO,https://scontent-iad3-1.cdninstagram.com/o1/v/...,2023-06-11T11:03:25+0000,Follow nerdrudransh for more fax 🤨\n\n#adipuru...,0,Follow nerdrudransh for more fax 🤨,#adipurush #movie #cinema #bollywood #actor #p...,1,Follow nerdrudransh for more fax,follow nerdrudransh fax
4,12,12,12,17982058046476888,https://www.instagram.com/p/CtWQNsYSZ6w/,0.0,0.0,CAROUSEL_ALBUM,,2023-06-11T11:03:11+0000,Follow more new update\n.\n.\n.\n.\n.\n.\n.\n#...,0,Follow more new update . . . . . . .,#currentaffair #gk #ssc #upsc #ssccgl #current...,1,Follow more new update,follow new update
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8520,195551,113695,3,18185951629272070,https://www.instagram.com/reel/Ct1lttSg3pT/,0.0,0.0,VIDEO,https://scontent-iad3-1.cdninstagram.com/o1/v/...,2023-06-23T15:08:46+0000,Official Answer… by mattwalshblog \n**********...,1,Official Answer… by mattwalshblog ************...,#WeAreBreitbart #TheEpochTimes #Republicans #D...,1,Official by mattwalshblog,official mattwalshblog
8521,195552,113696,4,17983389545190690,https://www.instagram.com/p/Ct1luLdLvct/,0.0,0.0,IMAGE,https://scontent-iad3-2.cdninstagram.com/v/t51...,2023-06-23T15:07:34+0000,Fake Fact: Sloths have a 5th leg that is only ...,1,Fake Fact: Sloths have a 5th leg that is only ...,#fact #factsdaily #dailyfacts #aiart #sloths #...,1,Fake Sloths have a 5th leg that is only visib...,"fake sloth 5th leg visible uv spectrum , thus ..."
8522,195553,113697,5,17970893624266885,https://www.instagram.com/reel/Ct1lXfugsWg/,0.0,1.0,VIDEO,https://scontent-iad3-1.cdninstagram.com/o1/v/...,2023-06-23T15:07:21+0000,Propaganda is so effective it’s fascinating 🖤\...,1,Propaganda is so effective it’s fascinating 🖤,#propaganda #news #fakenews #joerogan #riseaga...,1,Propaganda is so effective fascinating,propaganda effective fascinating
8523,195559,113703,11,18010682851736514,https://www.instagram.com/p/Ct1kzAzLcSm/,0.0,0.0,IMAGE,https://scontent-iad3-1.cdninstagram.com/v/t51...,2023-06-23T14:59:30+0000,Biden sniffin’ a troon 👀\n\n******************...,1,Biden sniffin’ a troon 👀 *******************,#WeAreBreitbart #TheEpochTimes #Republicans #D...,1,Biden a troon,biden troon


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=4c8af7b1-f3b8-45ab-bbdc-6a32713107d1' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>