In [1]:
import csv
import numpy as np
import pandas as pd

import math

from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

In [2]:
## Read a CSV file into a pandas DataFrame and drop the specified columns

def Read_DataFile(filepath, Columns_to_drop=[]):
    """Load a CSV file and return it without the listed columns.

    Parameters
    ----------
    filepath : path of the CSV file, passed straight to ``pd.read_csv``.
    Columns_to_drop : column labels to remove; the default drops nothing.
        The default list is only read, never modified.

    Returns
    -------
    pandas.DataFrame holding the file contents minus ``Columns_to_drop``.
    """
    return pd.read_csv(filepath).drop(columns=Columns_to_drop)

In [3]:
## Merge multiple datafiles into single file (all files must have same columns)

def Merge_DataFiles(filepath_list, Output_filepath):
    """Read several CSV files, stack them row-wise and save the result.

    Parameters
    ----------
    filepath_list : non-empty collection of CSV file paths. Each one is
        loaded with ``Read_DataFile`` (no columns dropped).
    Output_filepath : destination path for the merged CSV.

    Returns
    -------
    pandas.DataFrame with the rows of every input file, in the order the
    paths were given. Row labels of the individual files are kept as-is,
    so the index of the result can contain repeated labels.

    Raises
    ------
    ValueError
        If ``filepath_list`` is empty.
    """
    filepaths = list(filepath_list)
    if not filepaths:
        raise ValueError("filepath_list must contain at least one file path")

    # Load everything first and concatenate once: concatenating inside the
    # loop copies the accumulated frame again on every iteration.
    frames = [Read_DataFile(filepath) for filepath in filepaths]
    merged_df = pd.concat(frames)

    # NOTE(review): the (possibly repeated) row labels are written as the
    # first CSV column, as before -- confirm whether readers of the merged
    # file rely on that column before switching to index=False.
    merged_df.to_csv(Output_filepath)
    return merged_df

In [4]:
## Data Preprocessing on caption, comments, hashtags

def Data_Processing(data):
    """Tokenize caption, comments and hashtags of every post.

    Parameters
    ----------
    data : pandas.DataFrame containing at least the columns ``"text"``
        (caption), ``"comments"`` and ``"hashtags"``. Cells that are not
        ``str`` (e.g. NaN for a missing value) are skipped.

    Returns
    -------
    pandas.DataFrame with one row per input row (fresh 0..n-1 index) and
    the columns:

    * ``Hashtags``        - space-joined unique hashtags, in order of first
      appearance (caption, then comments, then the hashtags column)
    * ``Tags_Len``        - number of unique hashtags
    * ``Caption_Tokens``  - space-joined case-folded caption tokens
    * ``Cap_Tokens_Len``  - number of caption tokens
    * ``Comments_Tokens`` - space-joined case-folded comment tokens
    * ``Com_Tokens_Len``  - number of comment tokens

    Raises
    ------
    KeyError
        If one of the three required columns is missing from ``data``.
    """
    Column_list = ["Hashtags", "Tags_Len", "Caption_Tokens", "Cap_Tokens_Len", "Comments_Tokens", "Com_Tokens_Len"]
    Req_Columns = ["text","comments","hashtags"]
    Req_Data = data[Req_Columns]

    # A token is a run of letters, digits, '_' or '#'; everything else
    # (punctuation, emojis, whitespace) acts as a separator.
    tokenizer = RegexpTokenizer(r'[a-zA-Z0-9_#]+')
    stop_words = set(stopwords.words('english'))

    # Collect one record per post and build the frame once at the end;
    # appending one-row frames with pd.concat re-copies the result each time.
    records = []

    for row_values in Req_Data.itertuples(index=False, name=None):

        # dict keys act as an insertion-ordered set, so the joined hashtag
        # string is identical from run to run (a plain set of str is
        # iterated in an order that depends on hash randomisation).
        Hashtags = {}
        words = {"text": [], "comments": []}

        for col, text_content in zip(Req_Columns, row_values):

            if not isinstance(text_content, str):
                continue

            if col == "hashtags":
                ## Split on whitespace only, no further processing for hashtags
                for token in text_content.split():
                    if token.startswith('#'):
                        Hashtags[token] = None
                continue

            ## Caption / comments: tokenize with the regexp tokenizer
            for token in tokenizer.tokenize(text_content):

                ## Remove numbers
                if token.isnumeric():
                    continue

                ## Move hashtags found in caption/comments to the hashtag collection
                if token.startswith('#'):
                    Hashtags[token] = None
                    continue

                ## Remove stopwords, keep everything else case-folded
                folded = token.casefold()
                if folded not in stop_words:
                    words[col].append(folded)

        Caption = words["text"]
        Comments = words["comments"]

        records.append([
            ' '.join(Hashtags), len(Hashtags),
            ' '.join(Caption), len(Caption),
            ' '.join(Comments), len(Comments),
        ])

    return pd.DataFrame(records, columns=Column_list)
        

In [5]:
def Filter_Data(Data, Hashtags_Threshold=1, Caption_Threshold=10, Comments_Threshold=10):
    """Keep only the rows that reach all three minimum counts.

    Parameters
    ----------
    Data : pandas.DataFrame with the numeric columns ``Tags_Len``,
        ``Cap_Tokens_Len`` and ``Com_Tokens_Len``.
    Hashtags_Threshold : minimum allowed ``Tags_Len``.
    Caption_Threshold : minimum allowed ``Cap_Tokens_Len``.
    Comments_Threshold : minimum allowed ``Com_Tokens_Len``.

    Returns
    -------
    A new pandas.DataFrame holding the rows of ``Data`` for which none of
    the three counts is strictly below its threshold. Original row labels
    and row order are preserved; ``Data`` itself is not modified.

    The shapes before and after filtering are printed.
    """
    print("Original data shape: ", Data.shape)

    # A row is rejected when ANY count is strictly below its minimum.
    # Expressed as "not below" (rather than ">=") so that a comparison that
    # evaluates to False, e.g. against NaN, keeps the row.
    below_threshold = (
        (Data['Tags_Len'] < Hashtags_Threshold)
        | (Data['Cap_Tokens_Len'] < Caption_Threshold)
        | (Data['Com_Tokens_Len'] < Comments_Threshold)
    )

    # Select by boolean mask (positional) instead of dropping by index
    # label: dropping a label removes every row carrying that label, which
    # discards valid rows when the index contains duplicates.
    Filtered_Data = Data.loc[~below_threshold].copy()
    print("Filtered data shape: ", Filtered_Data.shape)

    return Filtered_Data
    

In [6]:
# One scraped CSV per hashtag; all of them are stacked into a single file.
Files_to_merge = [
    "./Data/everydaysexism.csv",
    "./Data/genderbias.csv",
    "./Data/genderstereotype.csv",
    "./Data/heforshe.csv",
    "./Data/mencallmethings.csv",
    "./Data/metoo.csv",
    "./Data/misogynist.csv",
    "./Data/notallmen.csv",
    "./Data/questionsformen.csv",
    "./Data/slutgate.csv",
    "./Data/wagegap.csv",
    "./Data/weareequal.csv",
    "./Data/womenareinferior.csv",
    "./Data/workplaceharassment.csv",
    "./Data/yesallwomen.csv",
]

# Step 1: merge the per-hashtag files and persist the merged table.
merged_df = Merge_DataFiles(Files_to_merge, "./Data/Merged_Data.csv")
print("Merged Data Shape: ",merged_df.shape)

# Step 2: tokenize caption / comments / hashtags and persist the result.
Processed_Df = Data_Processing(merged_df)
print("Processed Data Shape: ", Processed_Df.shape)
Processed_Df.to_csv("./Data/Processed_Data.csv")




of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':


Merged Data Shape:  (12484, 6)
Processed Data Shape:  (12484, 6)


In [25]:
# Sweep every combination of minimum hashtag count (1..10), minimum caption
# token count (5..10) and minimum comment token count (5..10), recording how
# many posts survive the filter for each combination.
Filtered_Data_Analysis = []

for i, j, k in ((i, j, k)
                for i in range(1, 11)
                for j in range(5, 11)
                for k in range(5, 11)):

    Filtered_Data = Filter_Data(Processed_Df, i, j, k)

    # Row layout: [hashtag threshold, caption threshold, comments threshold, rows kept]
    row_entry = np.array([i, j, k, Filtered_Data.shape[0]])
    Filtered_Data_Analysis.append(row_entry)

    print(row_entry)
    print("======================")

print("========= Filtered_Data_Analysis ============")
print(Filtered_Data_Analysis)
        

Original data shape:  (12484, 6)
Filtered data shape:  (5635, 6)
[   1    5    5 5635]
Original data shape:  (12484, 6)
Filtered data shape:  (5331, 6)
[   1    5    6 5331]
Original data shape:  (12484, 6)
Filtered data shape:  (5042, 6)
[   1    5    7 5042]
Original data shape:  (12484, 6)
Filtered data shape:  (4777, 6)
[   1    5    8 4777]
Original data shape:  (12484, 6)
Filtered data shape:  (4558, 6)
[   1    5    9 4558]
Original data shape:  (12484, 6)
Filtered data shape:  (4350, 6)
[   1    5   10 4350]
Original data shape:  (12484, 6)
Filtered data shape:  (5398, 6)
[   1    6    5 5398]
Original data shape:  (12484, 6)
Filtered data shape:  (5106, 6)
[   1    6    6 5106]
Original data shape:  (12484, 6)
Filtered data shape:  (4832, 6)
[   1    6    7 4832]
Original data shape:  (12484, 6)
Filtered data shape:  (4579, 6)
[   1    6    8 4579]
Original data shape:  (12484, 6)
Filtered data shape:  (4373, 6)
[   1    6    9 4373]
Original data shape:  (12484, 6)
Filtered d

Filtered data shape:  (4576, 6)
[   3    5    8 4576]
Original data shape:  (12484, 6)
Filtered data shape:  (4364, 6)
[   3    5    9 4364]
Original data shape:  (12484, 6)
Filtered data shape:  (4165, 6)
[   3    5   10 4165]
Original data shape:  (12484, 6)
Filtered data shape:  (5192, 6)
[   3    6    5 5192]
Original data shape:  (12484, 6)
Filtered data shape:  (4906, 6)
[   3    6    6 4906]
Original data shape:  (12484, 6)
Filtered data shape:  (4643, 6)
[   3    6    7 4643]
Original data shape:  (12484, 6)
Filtered data shape:  (4395, 6)
[   3    6    8 4395]
Original data shape:  (12484, 6)
Filtered data shape:  (4195, 6)
[   3    6    9 4195]
Original data shape:  (12484, 6)
Filtered data shape:  (4012, 6)
[   3    6   10 4012]
Original data shape:  (12484, 6)
Filtered data shape:  (5008, 6)
[   3    7    5 5008]
Original data shape:  (12484, 6)
Filtered data shape:  (4736, 6)
[   3    7    6 4736]
Original data shape:  (12484, 6)
Filtered data shape:  (4492, 6)
[   3    7 

Filtered data shape:  (4929, 6)
[   5    6    5 4929]
Original data shape:  (12484, 6)
Filtered data shape:  (4658, 6)
[   5    6    6 4658]
Original data shape:  (12484, 6)
Filtered data shape:  (4408, 6)
[   5    6    7 4408]
Original data shape:  (12484, 6)
Filtered data shape:  (4171, 6)
[   5    6    8 4171]
Original data shape:  (12484, 6)
Filtered data shape:  (3978, 6)
[   5    6    9 3978]
Original data shape:  (12484, 6)
Filtered data shape:  (3800, 6)
[   5    6   10 3800]
Original data shape:  (12484, 6)
Filtered data shape:  (4755, 6)
[   5    7    5 4755]
Original data shape:  (12484, 6)
Filtered data shape:  (4497, 6)
[   5    7    6 4497]
Original data shape:  (12484, 6)
Filtered data shape:  (4265, 6)
[   5    7    7 4265]
Original data shape:  (12484, 6)
Filtered data shape:  (4042, 6)
[   5    7    8 4042]
Original data shape:  (12484, 6)
Filtered data shape:  (3853, 6)
[   5    7    9 3853]
Original data shape:  (12484, 6)
Filtered data shape:  (3686, 6)
[   5    7 

Filtered data shape:  (3944, 6)
[   7    6    8 3944]
Original data shape:  (12484, 6)
Filtered data shape:  (3764, 6)
[   7    6    9 3764]
Original data shape:  (12484, 6)
Filtered data shape:  (3601, 6)
[   7    6   10 3601]
Original data shape:  (12484, 6)
Filtered data shape:  (4501, 6)
[   7    7    5 4501]
Original data shape:  (12484, 6)
Filtered data shape:  (4257, 6)
[   7    7    6 4257]
Original data shape:  (12484, 6)
Filtered data shape:  (4035, 6)
[   7    7    7 4035]
Original data shape:  (12484, 6)
Filtered data shape:  (3822, 6)
[   7    7    8 3822]
Original data shape:  (12484, 6)
Filtered data shape:  (3646, 6)
[   7    7    9 3646]
Original data shape:  (12484, 6)
Filtered data shape:  (3494, 6)
[   7    7   10 3494]
Original data shape:  (12484, 6)
Filtered data shape:  (4356, 6)
[   7    8    5 4356]
Original data shape:  (12484, 6)
Filtered data shape:  (4117, 6)
[   7    8    6 4117]
Original data shape:  (12484, 6)
Filtered data shape:  (3906, 6)
[   7    8 

Filtered data shape:  (4244, 6)
[   9    7    5 4244]
Original data shape:  (12484, 6)
Filtered data shape:  (4014, 6)
[   9    7    6 4014]
Original data shape:  (12484, 6)
Filtered data shape:  (3812, 6)
[   9    7    7 3812]
Original data shape:  (12484, 6)
Filtered data shape:  (3617, 6)
[   9    7    8 3617]
Original data shape:  (12484, 6)
Filtered data shape:  (3452, 6)
[   9    7    9 3452]
Original data shape:  (12484, 6)
Filtered data shape:  (3306, 6)
[   9    7   10 3306]
Original data shape:  (12484, 6)
Filtered data shape:  (4110, 6)
[   9    8    5 4110]
Original data shape:  (12484, 6)
Filtered data shape:  (3884, 6)
[   9    8    6 3884]
Original data shape:  (12484, 6)
Filtered data shape:  (3692, 6)
[   9    8    7 3692]
Original data shape:  (12484, 6)
Filtered data shape:  (3498, 6)
[   9    8    8 3498]
Original data shape:  (12484, 6)
Filtered data shape:  (3345, 6)
[   9    8    9 3345]
Original data shape:  (12484, 6)
Filtered data shape:  (3202, 6)
[   9    8 

In [26]:
# Turn the collected sweep rows into a table and save it; the column names
# are only applied to the CSV header, the in-memory frame keeps 0..3.
Filtered_Data_Analysis = pd.DataFrame(Filtered_Data_Analysis)
Filtered_Data_Analysis.to_csv(
    "./Data/Filtered_Data_Analysis.csv",
    header=[
        "Hashtags_Threshold",
        "Caption_Len_Threshold",
        "Comments_Len_Threshold",
        "Filtered_Data_Size",
    ],
)


In [29]:
# Final dataset: require at least 10 hashtags, 10 caption tokens and
# 10 comment tokens per post, then persist the surviving rows.
Filtered_Data = Filter_Data(
    Processed_Df,
    Hashtags_Threshold=10,
    Caption_Threshold=10,
    Comments_Threshold=10,
)
Filtered_Data.to_csv("./Data/Filtered_Data.csv")

Original data shape:  (12484, 6)
Filtered data shape:  (2909, 6)


In [None]:
## Merge all files of different hashtags into one file, Drop unnecessary columns
## Remove non_english text(caption, hashtags, comments) from the post
## Tokenize text of each post
## Remove punctuations(except delimiters used), emojis, numbers, stopwords from text(caption, comments)
## Data Analysis: length_of_text vs no_of_posts (Zipf distribution), no_of_hashtags vs no_of_posts,    
##                Distribution of posts as per hashtags
## Remove posts having text of length less than particular threshold

