# Yelp (csv)

# [optional] only run if **train_data_full.csv** not exists
# Merging seperated csv data into one Pandas dataframe

In [None]:
import pandas as pd
import time
import datetime

In [None]:
data_dir = 'C:\\Users\\wiekern\\Desktop\\Masterarbeit\\yelp_dataset\\'
# 0: business, 1: checkin, 2: photo, 3: review, 4: tip, 5: user
data_class = {'business': 'business', 'checkin': 'checkin', 'photo': 'photo', 'review': 'review', 'tip': 'tip', 'user': 'user'}
file_type = '.csv'

business_df = pd.read_csv(data_dir + data_class['business'] + file_type, usecols=['business_id', 'name', 'categories'])
review_df = pd.read_csv(data_dir + data_class['review'] + file_type, usecols=['review_id', 'user_id', 'business_id', 'stars', 'date', 'useful', 'text'])
user_df = pd.read_csv(data_dir + data_class['user'] + file_type, usecols=['user_id', 'name'])


In [None]:
def get_business_category(business_id):
    return business_df[business_df['business_id'] == business_id]['categories']

def get_username(user_id):
    return user_df[user_df['user_id'] == user_id]['name']

def date_to_timestamp(date_str):
    return time.mktime(datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S').timetuple())
    

In [None]:
# refining interested attributes
reivew_df_refine = review_df
business_df_refine = business_df
business_df_refine.columns = ['business_id', 'business_name', 'categories']
business_df_refine = business_df_refine.dropna(axis=0)
user_df_refine =  user_df
user_df_refine.columns =  ['user_id', 'user_name']

# Extracting  categories correlated to 'restaurants'

In [1]:
import pandas as pd
data_dir = './csv_files/'

business_df = pd.read_csv(data_dir + 'business.csv', usecols=['business_id', 'name', 'categories'])

In [2]:
business_df['categories_lower'] = business_df['categories'].apply(lambda x: str(x).lower())

In [3]:
categories = business_df['categories_lower'].unique()

In [19]:
split_cates = [None] * len(categories)
flags = [False] * len(categories)
correlated_categories = {}
for c_k, c_v in enumerate(categories):
    if 'restaurants' in c_v:
        split_cates[c_k] = c_v.lower().split(',')
        for c in split_cates[c_k]:
            correlated_categories[c] = False
        flags[c_k] = True
    else:
        flags[c_k] = False

In [20]:
# correlated_categories = {'restaurants': False, 'chinese': False}
import queue
q = queue.Queue()
for key in correlated_categories:
    q.put(key)

In [21]:
print(len(correlated_categories))
def bfs_category():
    while not q.empty():
        element = q.get()
        if element is None:
            break;
        if not correlated_categories[element]:  # not visited
            correlated_categories[element] = True
            for c in categories:
                if element in c:
                    split_categories = c.lower().split(',')
                    for split_categoy in split_categories:
                        split_categoy = split_categoy.strip()
                        if split_categoy not in correlated_categories:
                            correlated_categories[split_categoy] = False
                            q.put(split_categoy)
bfs_category()

1182


In [23]:
len(correlated_categories)
import json
with open('correlated_categories.json', 'w') as f:
    json.dump(correlated_categories, f)

# Counting top N categories

In [None]:
category_freq = {}
for cate in business_df_refine.categories:
    for sub_cate in cate.lower().split(','):
        sub_cate = sub_cate.strip()
        category_freq[sub_cate] = category_freq.get(sub_cate, 0) + 1

In [None]:
sorted_category_freq = sorted(category_freq.items(), key=lambda kv: kv[1], reverse=True)
sorted_category_freq

# Merging dataframes

In [None]:
merged_df = pd.merge(reivew_df_refine, business_df_refine, on='business_id', how='outer')
merged_df = pd.merge(merged_df, user_df_refine, on='user_id', how='outer')

In [None]:
merged_df = merged_df.dropna(axis=0) # drop rows where NAN exists

In [None]:
merged_df['timestamp'] = merged_df['date'].apply(lambda x:date_to_timestamp(x))

In [None]:
merged_df.head()

In [None]:
merged_df = pd.read_csv('train_data_full.csv')
merged_df.head()

In [None]:
data_dir = 'C:\\Users\\wiekern\\Desktop\\Masterarbeit\\yelp_dataset\\'
business_df = pd.read_csv(data_dir + 'business.csv', usecols=['business_id', 'categories'])

In [None]:
merged_df = pd.merge(merged_df, business_df, on='business_id', how='outer')

In [None]:
merged_df.to_csv('train_data_full_cate.csv', index=False)

# Reading complete data from csv file

In [None]:
import os.path
import pandas as pd
if not os.path.exists('train_data_full.csv'):
    merged_df.to_csv('train_data_full.csv', index=False)

if merged_df is None:
    merged_df = pd.read_csv('train_data_full.csv')
merged_df.head()

In [None]:
import gender_guesser.detector as gender

d = gender.Detector(case_sensitive=False)

import string
translator = str.maketrans('', '', string.punctuation)

def get_gender(name):
    try:
        n = name.lower()
        if ('mom' in n) or ('girl' in n) or ('angel' in n) or ('mum' in n) or ('mother' in n) or ('woman' in n):
            return 'female'
        if ('boy' in n) or ('dude' in n):
            return 'male'
        temp = name.translate(translator).split()
    except:
        return 'unknown'
    
    if len(temp) > 0:
        first_name = temp[0]
        first_name = ''.join([i for i in first_name if not i.isdigit()])
        gender = d.get_gender(first_name,'usa')
        return gender
    else:
        return 'unknown'

In [None]:
merged_df['gender'] = merged_df['user_name'].apply(lambda x:get_gender(x))
merged_df.head()

In [None]:
disclosed_flags = merged_df['gender'].isin(['female', 'male'])
undisclosed_flags = ~disclosed_flags

In [None]:
disclosed_gender_df = merged_df[disclosed_flags]
disclosed_gender_df.head()

In [None]:
undisclosed_gender_df = merged_df[undisclosed_flags]
undisclosed_gender_df.head()

In [None]:
import os.path
import pandas as pd
if not os.path.exists('disclosed_dataset.csv'):
    print('saving disclosed dataset to csv')
    disclosed_gender_df.to_csv('disclosed_dataset.csv', index=False)
else:
    disclosed_gender_df = pd.read_csv('disclosed_dataset.csv')

if not os.path.exists('undisclosed_dataset.csv'):
    print('saving undisclosed dataset to csv')
    undisclosed_gender_df.to_csv('undisclosed_dataset.csv', index=False)
else:
    undisclosed_gender_df = pd.read_csv('undisclosed_dataset.csv')

In [None]:
undisclosed_gender_df.to_csv('undisclosed_dataset.csv', index=False)

In [None]:
undisclosed_gender_df = undisclosed_gender_df[['user_id', 'text', 'gender']]
undisclosed_gender_df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)
undisclosed_gender_df.to_csv('undisclosed_id_text_gender.csv', index=False, header=False)

In [None]:
mapping = {'female' : 1, 'male' : 0}
disclosed_dataset_df = disclosed_gender_df[['user_name', 'text', 'gender']]
disclosed_dataset_df.replace({'gender': mapping}, inplace=True)
disclosed_dataset_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# dataset_df = pd.read_csv('train_dataset.csv')
if 'disclosed_dataset_df' in locals():
    # 80/20 train/test
    train_df, test_df = train_test_split(disclosed_dataset_df, test_size=0.2)
    # 80/20 train/validation
    train_df, validation_df = train_test_split(train_df, test_size=0.2)

In [None]:
# extracting related attributes for training, validation and test
train_gender_text_df = train_df[['gender', 'text']]
validation_gender_text_df = validation_df[['gender', 'text']]
test_name_text_gender_df = test_df[['user_name', 'text', 'gender']]

In [None]:
# v2_test_df = pd.read_csv('v2_test_data.csv', engine='python',  encoding='utf-8', error_bad_lines=False, header=None)
train_gender_text_df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)
train_gender_text_df.to_csv('training_gender_text.csv', index=False, header=False)

In [None]:
validation_gender_text_df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)
validation_gender_text_df.to_csv('validation_gender_text.csv', index=False, header=False)

In [None]:
test_name_text_gender_df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)
test_name_text_gender_df.to_csv('test_name_text_gender.csv', index=False, header=False)

In [None]:
import pandas as pd
undisclosed_csv_df = pd.read_csv('undisclosed_dataset.csv')
undisclosed_csv_df.head()

In [None]:
undisclosed_dataset_df = undisclosed_csv_df[['user_id', 'text', 'gender']]
undisclosed_dataset_df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)
undisclosed_dataset_df.to_csv('undisclosed_id_text_gender.csv', index=False, header=False)

In [None]:
# Legacy
def remove_(x):
    x = str(x).replace('\n',' ')
    return x

v1_test_df['Review'] = v1_test_df[1].apply(lambda x:remove_(x))

def check(x):
    if '\n' in x:
        return True
    return False

v1_test_df['check'] = v1_test_df['Review'].apply(lambda x:check(x))