# Yelp (csv)

# [optional] only run if **train_data_full.csv** does not exist
# Merging separate CSV data into one Pandas dataframe

In [None]:
import pandas as pd
import time
import datetime
from pathlib import Path
data_dir = Path.cwd() / 'datasets/yelp'

In [None]:
# data_dir = Path.home() / 'GenderPerformance/datasets/yelp'
# 0: business, 1: checkin, 2: photo, 3: review, 4: tip, 5: user
# data_class = {'business': 'business', 'checkin': 'checkin', 'photo': 'photo', 'review': 'review', 'tip': 'tip', 'user': 'user'}

# Load a subset of columns from each Yelp CSV file found under data_dir.
business_df = pd.read_csv(
    data_dir / 'business.csv',
    usecols=['business_id', 'name', 'categories'],
)
review_df = pd.read_csv(
    data_dir / 'review.csv',
    usecols=['review_id', 'user_id', 'business_id', 'stars', 'date', 'useful', 'text'],
)
user_df = pd.read_csv(
    data_dir / 'user.csv',
    usecols=['user_id', 'name'],
)


In [None]:
def get_business_category(business_id):
    """Return the 'categories' values of the business_df rows matching business_id.

    The result is a pandas Series (empty when no row matches), not a scalar.
    """
    id_matches = business_df['business_id'] == business_id
    return business_df.loc[id_matches, 'categories']

def get_username(user_id):
    """Return the 'name' values of the user_df rows matching user_id.

    The result is a pandas Series (empty when no row matches), not a scalar.
    """
    id_matches = user_df['user_id'] == user_id
    return user_df.loc[id_matches, 'name']

def date_to_timestamp(date_str):
    """Convert a 'YYYY-MM-DD HH:MM:SS' string into a POSIX timestamp (float).

    The string is parsed as a naive datetime; time.mktime then interprets it
    in the machine's local time zone. Raises ValueError when date_str does
    not match the format.
    """
    parsed = datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
    return time.mktime(parsed.timetuple())
    

In [None]:
# Refine the attributes of interest: give the two 'name' columns distinct
# labels and discard businesses that have any missing value.
business_df = business_df.rename(columns={'name': 'business_name'})
business_df = business_df.dropna(axis=0)
user_df = user_df.rename(columns={'name': 'user_name'})

# Extracting categories correlated to 'restaurants'
Co-product for NaturalExptCategory (not_pairwise); not used here.

In [None]:
import pandas as pd
data_dir = Path.home() / 'GenderPerformance/datasets/yelp'

# data_dir is a pathlib.Path, so the file name is joined with '/';
# 'Path + str' is not supported and raises TypeError.
business_df = pd.read_csv(data_dir / 'business.csv', usecols=['business_id', 'name', 'categories'])

In [None]:
# Lower-cased string form of each row's categories (str() is applied first,
# so non-string values are converted to text as well).
business_df['categories_lower'] = business_df['categories'].map(lambda raw: str(raw).lower())

In [None]:
# categories = business_df['categories_lower'].unique()
# One lower-cased category string per business row (duplicates are kept).
categories = business_df.loc[:, 'categories_lower']

In [None]:
# directly correlated to 'restaurants'
import json  # imported here: the notebook's other `import json` lines sit in later cells

# split_cates[i] is the split category list of row i when that row mentions
# 'restaurants', otherwise None.
split_cates = [None] * len(categories)
# category name -> number of times it occurs in rows that mention 'restaurants'
correlated_categories = {}
for c_k, c_v in enumerate(categories):
    if 'restaurants' in c_v:
        split_cates[c_k] = c_v.lower().split(',')
        for c in split_cates[c_k]:
            c_striped = c.strip()
            correlated_categories[c_striped] = correlated_categories.get(c_striped, 0) + 1

with open(data_dir / 'directly_correlated_categories.json', 'w') as f:
    json.dump(correlated_categories, f)

# Print the 11 categories with the highest counts.
sorted_correlated_categories = sorted(correlated_categories.items(), key=lambda kv: kv[1], reverse=True)
print(sorted_correlated_categories[:11])

In [None]:
# correlated_categories = {'restaurants': False, 'chinese': False}
import queue

# Seed the work queue with every category name currently in correlated_categories.
q = queue.Queue()
for seed_category in correlated_categories:
    q.put(seed_category)

In [None]:
# Report how many distinct categories correlated_categories currently holds.
directly_correlated_count = len(correlated_categories)
print('# of directly correlated: ', directly_correlated_count)

def bfs_category():
    """Expand correlated_categories breadth-first over co-occurring categories.

    Pops category names from the global queue ``q``. For each name, every
    entry of the global ``categories`` whose text contains that name
    (substring match) is split on ',' and each stripped token has its count
    in the global ``correlated_categories`` incremented. Tokens seen for the
    first time are queued so their own co-occurring categories are visited.

    Stops when the queue is empty or a ``None`` sentinel is dequeued.
    Returns nothing; ``correlated_categories`` and ``q`` are mutated in place.
    """
    while not q.empty():
        element = q.get()
        if element is None:
            break
        if not element:
            # An empty name is a substring of every row and would pull in
            # every category, so it is never expanded.
            continue

        for c in categories:
            if element in c:
                for split_categoy in c.lower().split(','):
                    split_categoy = split_categoy.strip()
                    # Test membership BEFORE storing the count: once stored,
                    # the token is always "in" the dict, so a later test
                    # would never enqueue anything.
                    is_new = split_categoy not in correlated_categories
                    correlated_categories[split_categoy] = correlated_categories.get(split_categoy, 0) + 1
                    if is_new:
                        q.put(split_categoy)
# Run the expansion, then report how many categories are recorded afterwards.
bfs_category()
all_correlated_count = len(correlated_categories)
print('# of directly and undirectly correlated: ', all_correlated_count)

In [None]:
# Order the (category, count) pairs by descending count and show the first 11.
sorted_cates = list(correlated_categories.items())
sorted_cates.sort(key=lambda pair: pair[1], reverse=True)
print(sorted_cates[:11])

In [None]:
import json

# Persist the category -> count mapping as JSON.
len(correlated_categories)
all_categories_path = data_dir / 'all_correlated_categories.json'
with open(all_categories_path, 'w') as out_file:
    json.dump(correlated_categories, out_file)

# Counting top N unique independent categories
Co-product for NaturalExptCategory (not_pairwise); not used here.

In [None]:
# Discard rows with any missing value, then take the raw category strings.
business_df = business_df.dropna(axis='index', how='any')
categories_column = business_df['categories']

In [None]:
# Tally how often each individual (lower-cased, stripped) category occurs.
category_freq = {}
for category_list in categories_column:
    labels = (piece.strip() for piece in category_list.lower().split(','))
    for label in labels:
        category_freq[label] = category_freq.get(label, 0) + 1

In [None]:
# Rank categories from most to least frequent and save the first ten as JSON.
sorted_category_freq = list(category_freq.items())
sorted_category_freq.sort(key=lambda entry: entry[1], reverse=True)
top_ten = sorted_category_freq[:10]
with open(data_dir / 'top_10_categories.json', 'w') as out_file:
    json.dump(top_ten, out_file)
len(sorted_category_freq)

In [None]:
# Display the full ranked list of (category, count) pairs.
sorted_category_freq

In [None]:
import json

# Choose which saved category set is treated as "restaurant related":
# the directly correlated one (True) or the BFS-expanded one (False).
directly = True
if directly:
    related_path = data_dir / 'directly_correlated_categories.json'
else:
    related_path = data_dir / 'all_correlated_categories.json'

# Open read-only in both cases: mode 'w' would truncate the file and make
# json.load fail on a write-only handle.
with open(related_path) as json_file:
    restaurants_related_categories = json.load(json_file)

In [None]:
import json

# Walk the ranked categories and keep the first top_n that are NOT in the
# restaurant-related set.
top_n = 5
interest_categories = {}
for category_name, frequency in sorted_category_freq:
    if len(interest_categories) >= top_n:
        break
    if category_name in restaurants_related_categories:
        continue
    interest_categories[category_name] = frequency

# Add 'restaurants' itself next to them.
# NOTE(review): this count is read from the in-memory correlated_categories,
# not from restaurants_related_categories loaded from JSON — confirm intended.
interest_categories['restaurants'] = correlated_categories['restaurants']
print('interested categories:', interest_categories)
print(sorted_category_freq[:10])
with open(data_dir / 'top_n_correlated_categories.json', 'w') as out_file:
    json.dump(interest_categories, out_file)

# Merging dataframes

In [None]:
# Outer-join business columns onto the reviews by business_id, then user
# columns by user_id; outer joins keep rows that have no match.
merged_df = (
    review_df
    .merge(business_df, on='business_id', how='outer')
    .merge(user_df, on='user_id', how='outer')
)

In [None]:
# Drop every row that contains at least one NaN.
merged_df = merged_df.dropna(axis='index', how='any')

In [None]:
# Numeric timestamp for each row, derived from its 'date' string.
merged_df['timestamp'] = merged_df['date'].apply(date_to_timestamp)

In [None]:
# Remove tabs, newlines and carriage returns from the review text, both as
# literal backslash escape sequences and as real control characters.
# The result is assigned back to the column: calling replace(inplace=True) on
# merged_df['text'] is chained assignment, which under pandas copy-on-write
# only modifies a temporary object and leaves merged_df unchanged.
merged_df['text'] = merged_df['text'].replace(
    to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["", ""], regex=True)
merged_df.to_csv(data_dir / 'train_data_full.csv', index=False)

In [None]:
# Preview the first rows of merged_df.
merged_df.head()

In [None]:
# Reload the merged data from its saved CSV and preview it.
full_csv_path = data_dir / 'train_data_full.csv'
merged_df = pd.read_csv(full_csv_path)
merged_df.head()

In [None]:
data_dir = Path.home() / 'GenderPerformance/datasets/yelp'
# Join with '/': data_dir is a pathlib.Path and 'Path + str' raises TypeError.
business_df = pd.read_csv(data_dir / 'business.csv', usecols=['business_id', 'categories'])

In [None]:
# Outer-join the business columns onto merged_df by business_id.
merged_df = merged_df.merge(business_df, on='business_id', how='outer')

In [None]:
# NOTE(review): relative path, so the file is written to the current working
# directory rather than data_dir — confirm this is intended.
categorized_csv_name = 'train_data_full_cate.csv'
merged_df.to_csv(categorized_csv_name, index=False)

# Reading complete data from csv file

In [None]:
import os.path
import pandas as pd
full_csv_path = data_dir / 'train_data_full.csv'

if globals().get('merged_df') is not None:
    # An in-memory frame exists: make sure it is also on disk, at the same
    # path that is checked (previously it was written to the working
    # directory, so the existence check under data_dir could never succeed).
    if not os.path.exists(full_csv_path):
        merged_df.to_csv(full_csv_path, index=False)
else:
    # merged_df is undefined or None (e.g. a fresh kernel): load it from the
    # CSV. A plain `merged_df is None` test raises NameError when the name
    # was never defined.
    merged_df = pd.read_csv(full_csv_path)
merged_df.head()

In [None]:
import gender_guesser.detector as gender

# Detector from gender_guesser, constructed with case_sensitive=False;
# get_gender below calls d.get_gender on a cleaned first name.
d = gender.Detector(case_sensitive=False)

import string

# Translation table that deletes every character in string.punctuation.
translator = str.maketrans(dict.fromkeys(string.punctuation))

def get_gender(name):
    """Guess a gender label from a user's display name.

    Args:
        name: The display name. Anything that is not a ``str`` (for example
            a NaN from a missing value) yields ``'unknown'``.

    Returns:
        ``'female'`` or ``'male'`` when the lower-cased name contains one of
        the hard-coded keywords; otherwise the result of the module-level
        detector ``d`` for the first word of the name (punctuation and
        digits removed, looked up with country ``'usa'``); ``'unknown'``
        when the name has no words left after punctuation is removed.
    """
    # Explicit type check instead of a bare ``except:``, which also hid
    # unrelated failures (such as a missing ``translator``) and interrupts.
    if not isinstance(name, str):
        return 'unknown'

    n = name.lower()
    if any(keyword in n for keyword in ('mom', 'girl', 'angel', 'mum', 'mother', 'woman')):
        return 'female'
    if any(keyword in n for keyword in ('boy', 'dude')):
        return 'male'

    words = name.translate(translator).split()
    if not words:
        return 'unknown'

    first_name = ''.join(ch for ch in words[0] if not ch.isdigit())
    # Returned directly rather than stored in a local called ``gender``,
    # which would shadow the imported gender_guesser module alias.
    return d.get_gender(first_name, 'usa')

In [None]:
# Smoke test with a sample name: get_gender requires a name argument, so
# calling it with none raises TypeError.
print(get_gender('Mary'))

In [None]:
# Label every row with the gender guessed from its user_name, then preview.
merged_df['gender'] = merged_df['user_name'].apply(get_gender)
merged_df.head()

In [None]:
# Rows labelled exactly 'female' or 'male' count as disclosed; every other
# label counts as undisclosed.
known_labels = ['female', 'male']
disclosed_flags = merged_df['gender'].isin(known_labels)
undisclosed_flags = ~disclosed_flags

In [None]:
# Rows whose gender label is disclosed.
disclosed_gender_df = merged_df.loc[disclosed_flags]
disclosed_gender_df.head()

In [None]:
# Preview the disclosed rows without the label column. The frame itself keeps
# 'gender': later cells select and filter on that column, so removing it from
# disclosed_gender_df would make them fail with a KeyError.
disclosed_gender_df.drop(columns=['gender']).head()

In [None]:
undisclosed_gender_df = merged_df[undisclosed_flags]
# Preview without the label column. The column is named 'gender' (lower
# case); dropping 'Gender' raised a KeyError. The frame itself keeps the
# column because a later cell exports it.
undisclosed_gender_df.drop(columns=['gender']).head()

# Save the full disclosed and undisclosed datasets to CSV; if the CSV files already exist, read them in instead

In [None]:
import os.path
import pandas as pd
# Disclosed data: read the CSV when it is already on disk, otherwise write it.
disclosed_csv_path = data_dir / 'disclosed_dataset.csv'
if os.path.exists(disclosed_csv_path):
    disclosed_gender_df = pd.read_csv(disclosed_csv_path)
else:
    print('saving disclosed dataset to csv')
    disclosed_gender_df.to_csv(disclosed_csv_path, index=False)

# Undisclosed data: same read-or-write rule.
undisclosed_csv_path = data_dir / 'undisclosed_dataset.csv'
if os.path.exists(undisclosed_csv_path):
    undisclosed_gender_df = pd.read_csv(undisclosed_csv_path)
else:
    print('saving undisclosed dataset to csv')
    undisclosed_gender_df.to_csv(undisclosed_csv_path, index=False)

In [None]:
# Keep only the id, text and gender columns, then export without a header row.
export_columns = ['user_id', 'text', 'gender']
undisclosed_gender_df = undisclosed_gender_df[export_columns]
# undisclosed_gender_df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)
undisclosed_gender_df.to_csv(data_dir / 'undisclosed_id_text_gender.csv', index=False, header=False)

In [None]:
# Encode the gender label numerically: female -> 1, male -> 0.
mapping = {'female': 1, 'male': 0}
disclosed_dataset_df = disclosed_gender_df[['user_name', 'text', 'gender']].replace({'gender': mapping})
disclosed_dataset_df.head()

# plot data distribution

In [None]:
# Shapes of the disclosed subsets with gender 1 and gender 0, and of the
# undisclosed frame.
is_female = disclosed_dataset_df['gender'] == 1
is_male = disclosed_dataset_df['gender'] == 0
DF_shape = disclosed_dataset_df.loc[is_female].shape
DM_shape = disclosed_dataset_df.loc[is_male].shape
UNDIS_shape = undisclosed_gender_df.shape

In [None]:
# Total number of rows across the disclosed and undisclosed frames.
total_size = len(disclosed_dataset_df) + UNDIS_shape[0]

In [None]:
# Row counts: total, disclosed, undisclosed, gender 0, gender 1.
disclosed_rows = disclosed_dataset_df.shape[0]
print(total_size, disclosed_rows, UNDIS_shape[0], DM_shape[0], DF_shape[0])

In [None]:
import matplotlib.pyplot as plt

# Pie chart of the fraction of rows in each of the three groups.
labels = 'SM', 'SW', 'Performing'
sizes = [shape[0] / total_size for shape in (DM_shape, DF_shape, UNDIS_shape)]
explode = (0, 0, 0)  # all zeros: no slice is pulled out of the pie

fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle

plt.show()

# Split disclosed dataset into train, test and validation
# train: text, gender
# test: name, text, gender
# validation: text, gender

In [None]:
from sklearn.model_selection import train_test_split

# dataset_df = pd.read_csv('train_dataset.csv')
if 'disclosed_dataset_df' in locals():
    # First hold out 20% of all rows as the test set ...
    train_val_df, test_df = train_test_split(disclosed_dataset_df, test_size=0.2)
    # ... then hold out 20% of the remaining rows as the validation set.
    train_df, validation_df = train_test_split(train_val_df, test_size=0.2)

In [None]:
# Select the columns each split needs: (gender, text) for training and
# validation, (user_name, text, gender) for test.
label_text_columns = ['gender', 'text']
train_gender_text_df = train_df[label_text_columns]
validation_gender_text_df = validation_df[label_text_columns]
test_name_text_gender_df = test_df[['user_name', 'text', 'gender']]

In [None]:
# v2_test_df = pd.read_csv('v2_test_data.csv', engine='python',  encoding='utf-8', error_bad_lines=False, header=None)
# train_gender_text_df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)
# Export the training split without index or header row.
training_csv_path = data_dir / 'training_gender_text.csv'
train_gender_text_df.to_csv(training_csv_path, index=False, header=False)

In [None]:
# validation_gender_text_df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)
# Export the validation split without index or header row.
validation_csv_path = data_dir / 'validation_gender_text.csv'
validation_gender_text_df.to_csv(validation_csv_path, index=False, header=False)

In [None]:
# test_name_text_gender_df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)
# Export the test split without index or header row.
test_csv_path = data_dir / 'test_name_text_gender.csv'
test_name_text_gender_df.to_csv(test_csv_path, index=False, header=False)

# Helper functions for removing escape characters and checking that they were removed

In [None]:
# Legacy
def remove_(x):
    """Return str(x) with every newline character replaced by one space."""
    return ' '.join(str(x).split('\n'))

# Newline-free copy of column 1, stored as 'Review'.
# NOTE(review): v1_test_df is not defined in the visible cells — confirm where it is loaded.
v1_test_df['Review'] = v1_test_df[1].apply(remove_)

def check(x):
    """Return True when x still contains a newline character, else False."""
    return '\n' in x

# Flag the rows whose 'Review' text still contains a newline.
v1_test_df['check'] = v1_test_df['Review'].apply(check)