In [1]:
import sqlite3
import pandas as pd

In [2]:
# Load JSON data in chunks to avoid memory issues
def load_dataset(file_lists, prefix_path, chunk_size=10000):
    df_dict = {}
    for file in file_lists:
        try:
            df_chunks = []
            total_records = 0

            for chunk in pd.read_json(prefix_path + file, lines=True, chunksize=chunk_size):
                df_chunks.append(chunk)
                total_records += chunk.shape[0]

            df = pd.concat(df_chunks, ignore_index=True)
            df_dict[file] = df
            print(f"Total records in {file}: {df.shape[0]}.")

        except Exception as e:
            print(f"Error: {e}")
            continue
    return df_dict

In [3]:
# Convert Timestamps to strings in the dataset
def convert_timestamp_to_str(df, date_columns):
    for col in date_columns:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: x.strftime('%Y-%m-%d %H:%M:%S') if pd.notnull(x) else None)
    return df

In [4]:
# Insert business data with additional fields for attributes and hours
def insert_business_data(df_business, conn, batch_size=10000):
    cursor = conn.cursor()
    cursor.execute('BEGIN TRANSACTION')

    for i in range(0, len(df_business), batch_size):
        batch = df_business.iloc[i:i + batch_size]
        
        # Convert attributes and hours dictionaries to strings
        batch['attributes'] = batch['attributes'].apply(lambda x: str(x) if pd.notnull(x) else None)
        batch['hours'] = batch['hours'].apply(lambda x: str(x) if pd.notnull(x) else None)

        # Insert business details with new columns
        cursor.executemany('''INSERT OR IGNORE INTO business_details 
                              (business_id, name, address, city, state, postal_code, latitude, longitude, stars, review_count, is_open, attributes, hours)
                              VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                           batch[['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'attributes', 'hours']].values.tolist())

        # Insert categories
        for _, row in batch.iterrows():
            categories = row['categories'].split(', ') if row['categories'] else []
            cursor.executemany('INSERT OR IGNORE INTO business_categories (business_id, category) VALUES (?, ?)', 
                               [(row['business_id'], cat) for cat in categories])

    conn.commit()

In [4]:
# Insert checkin data into business database in batches
def insert_checkin_data(df_checkin, conn, batch_size=10000):
    cursor = conn.cursor()
    cursor.execute('BEGIN TRANSACTION')

    for i in range(0, len(df_checkin), batch_size):
        batch = df_checkin.iloc[i:i + batch_size]
        cursor.executemany('''INSERT OR IGNORE INTO checkin_data 
                              (business_id, checkin_date)
                              VALUES (?, ?)''',
                           batch[['business_id', 'date']].values.tolist())

    conn.commit()

In [8]:
# Insert review data
def insert_review_data(df_review, conn, batch_size=10000):
    cursor = conn.cursor()
    cursor.execute('BEGIN TRANSACTION')

    # Convert 'date' to string
    df_review = convert_timestamp_to_str(df_review, ['date'])

    for i in range(0, len(df_review), batch_size):
        batch = df_review.iloc[i:i + batch_size]
        cursor.executemany('''INSERT OR IGNORE INTO review_data 
                              (review_id, user_id, business_id, stars, date, text, sentiment_score, useful, funny, cool)
                              VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                           batch[['review_id', 'user_id', 'business_id', 'stars', 'date', 'text', 'sentiment_score', 'useful', 'funny', 'cool']].values.tolist())

    conn.commit()

In [7]:
# Insert user data with friends, elite, and compliments
def insert_user_data(df_user, conn, batch_size=10000):
    cursor = conn.cursor()
    cursor.execute('BEGIN TRANSACTION')

    for i in range(0, len(df_user), batch_size):
        batch = df_user.iloc[i:i + batch_size]

        # Convert arrays to strings
        batch['friends'] = batch['friends'].apply(lambda x: ', '.join(x) if isinstance(x, list) else None)
        batch['elite'] = batch['elite'].apply(lambda x: ', '.join(str(e) for e in x) if isinstance(x, list) else None)

        # Insert user data with new columns
        cursor.executemany('''INSERT OR IGNORE INTO user_data 
                              (user_id, name, review_count, yelping_since, useful, funny, cool, fans, average_stars, friends, elite, compliment_hot, compliment_more, compliment_profile, compliment_cute, compliment_list, compliment_note, compliment_plain, compliment_cool, compliment_funny, compliment_writer, compliment_photos)
                              VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                           batch[['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny', 'cool', 'fans', 'average_stars', 'friends', 'elite', 'compliment_hot', 'compliment_more', 'compliment_profile', 'compliment_cute', 'compliment_list', 'compliment_note', 'compliment_plain', 'compliment_cool', 'compliment_funny', 'compliment_writer', 'compliment_photos']].values.tolist())

    conn.commit()

In [8]:
# Insert tip data
def insert_tip_data(df_tip, conn, batch_size=10000):
    cursor = conn.cursor()
    cursor.execute('BEGIN TRANSACTION')

    # Convert 'date' to string
    df_tip = convert_timestamp_to_str(df_tip, ['date'])

    for i in range(0, len(df_tip), batch_size):
        batch = df_tip.iloc[i:i + batch_size]
        cursor.executemany('''INSERT OR IGNORE INTO tip_data 
                              (user_id, business_id, text, date, compliment_count)
                              VALUES (?, ?, ?, ?, ?)''',
                           batch[['user_id', 'business_id', 'text', 'date', 'compliment_count']].values.tolist())

    conn.commit()

In [None]:
# Create tables for Business, Review, User, and Tip data
def create_tables(conn, table_type):
    cursor = conn.cursor()
    
    if table_type == 'business':
        # Create table for business details with additional fields for attributes and hours
        cursor.execute('''CREATE TABLE IF NOT EXISTS business_details (
                            business_id TEXT PRIMARY KEY,
                            name TEXT,
                            address TEXT,
                            city TEXT,
                            state TEXT,
                            postal_code TEXT,
                            latitude REAL,
                            longitude REAL,
                            stars REAL,
                            review_count INTEGER,
                            is_open INTEGER,
                            attributes TEXT,   -- Storing attributes dictionary as a string
                            hours TEXT          -- Storing hours dictionary as a string
                        )''')

        # Create table for business categories
        cursor.execute('''CREATE TABLE IF NOT EXISTS business_categories (
                            business_id TEXT,
                            category TEXT,
                            FOREIGN KEY (business_id) REFERENCES business_details(business_id)
                        )''')

        # Create table for checkins
        cursor.execute('''CREATE TABLE IF NOT EXISTS checkin_data (
                            business_id TEXT,
                            checkin_date TEXT,  -- Format: YYYY-MM-DD HH:MM:SS
                            FOREIGN KEY (business_id) REFERENCES business_details(business_id)
                        )''')
    
    elif table_type == 'user':
        # Create table for users with additional fields for friends, elite, and compliments
        cursor.execute('''CREATE TABLE IF NOT EXISTS user_data (
                            user_id TEXT PRIMARY KEY,
                            name TEXT,
                            review_count INTEGER,
                            yelping_since TEXT,
                            useful INTEGER,
                            funny INTEGER,
                            cool INTEGER,
                            fans INTEGER,
                            average_stars REAL,
                            friends TEXT,               -- Storing array of friends as a string
                            elite TEXT,                 -- Storing array of elite years as a string
                            compliment_hot INTEGER,    
                            compliment_more INTEGER,
                            compliment_profile INTEGER,
                            compliment_cute INTEGER,
                            compliment_list INTEGER,
                            compliment_note INTEGER,
                            compliment_plain INTEGER,
                            compliment_cool INTEGER,
                            compliment_funny INTEGER,
                            compliment_writer INTEGER,
                            compliment_photos INTEGER
                        )''')

    elif table_type == 'review':
        # Create table for reviews
        cursor.execute('''CREATE TABLE IF NOT EXISTS review_data (
                            review_id TEXT PRIMARY KEY,
                            user_id TEXT,
                            business_id TEXT,
                            stars REAL,
                            date TEXT,                   -- Format: YYYY-MM-DD HH:MM:SS
                            text TEXT,
                            sentiment_score REAL,
                            useful INTEGER,
                            funny INTEGER,
                            cool INTEGER,
                            FOREIGN KEY (user_id) REFERENCES user_data(user_id),
                            FOREIGN KEY (business_id) REFERENCES business_details(business_id)
                        )''')

    elif table_type == 'tip':
        # Create table for tips
        cursor.execute('''CREATE TABLE IF NOT EXISTS tip_data (
                            user_id TEXT,
                            business_id TEXT,
                            text TEXT,
                            date TEXT,                   -- Format: YYYY-MM-DD HH:MM:SS
                            compliment_count INTEGER,
                            FOREIGN KEY (business_id) REFERENCES business_details(business_id),
                            FOREIGN KEY (user_id) REFERENCES user_data(user_id)
                        )''')

    conn.commit()


In [5]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

def get_vader_sentiment(text):
    scores = analyzer.polarity_scores(text)
    return scores['compound']  # Compound score: -1 (negative) to +1 (positive)

Finish loading review_df
Finish sentiment


NameError: name 'insert_review_data' is not defined

In [None]:
db_path_review = '../../data/processed_data/yelp_data/yelp_review_data_new.db'
original_db_path = '../../data/processed_data/yelp_data/yelp_review_data.db'

ori_conn = sqlite3.connect(original_db_path)
conn_review = sqlite3.connect(db_path_review)

review_df = pd.read_sql_query("SELECT * FROM review_data", ori_conn)
print("Finish loading review_df")
review_df['sentiment_score'] = review_df['text'].apply(get_vader_sentiment)
print("Finish sentiment")


In [19]:
conn_review.close()

conn_review = sqlite3.connect(db_path_review)


In [20]:
cursor = conn_review.cursor()
cursor.execute('BEGIN TRANSACTION')

batch_size=10000
# Convert 'date' to string

for i in range(0, len(review_df), batch_size):
    batch = review_df.iloc[i:i + batch_size]
    cursor.executemany('''INSERT OR IGNORE INTO review_data 
                            (review_id, user_id, business_id, stars, date, text, sentiment_score, useful, funny, cool)
                            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                        batch[['review_id', 'user_id', 'business_id', 'stars', 'date', 'text', 'sentiment_score', 'useful', 'funny', 'cool']].values.tolist())

conn_review.commit()

In [14]:
insert_review_data(review_df, conn_review)

conn_review.close()
ori_conn.close()


AttributeError: 'str' object has no attribute 'strftime'

In [None]:
# Main execution
folder_path = '../../data/'
prefix_path = folder_path + 'raw_datasets/yelp/'
file_list = [
    # "sampled_yelp_academic_dataset_business.json",
    "sampled_yelp_academic_dataset_review.json",
    # "sampled_yelp_academic_dataset_user.json",
    # "sampled_yelp_academic_dataset_tip.json",
    # "sampled_yelp_academic_dataset_checkin.json"
]

# Load the datasets in chunks
df_dict = load_dataset(file_list, prefix_path)

# Split the datasets
df_business = df_dict["sampled_yelp_academic_dataset_business.json"]
df_review = df_dict["sampled_yelp_academic_dataset_review.json"]
df_user = df_dict["sampled_yelp_academic_dataset_user.json"]
df_tip = df_dict["sampled_yelp_academic_dataset_tip.json"]
df_checkin = df_dict["sampled_yelp_academic_dataset_checkin.json"]  

# Create connections for separate database files
db_path_business = '../../data/processed_data/yelp_data/yelp_business_data.db'
db_path_review = '../../data/processed_data/yelp_data/yelp_review_data.db'
db_path_user = '../../data/processed_data/yelp_data/yelp_user_data.db'
db_path_tip = '../../data/processed_data/yelp_data/yelp_tip_data.db'

Total records in sampled_yelp_academic_dataset_business.json: 78059.
Total records in sampled_yelp_academic_dataset_review.json: 980418.
Total records in sampled_yelp_academic_dataset_user.json: 229447.
Total records in sampled_yelp_academic_dataset_tip.json: 173085.
Total records in sampled_yelp_academic_dataset_checkin.json: 29190.


In [11]:
# print(df_business['attributes'][0])
# print()
# print(df_business['categories'][1])
# print()
# print(df_business['categories'][2])
df_checkin

Unnamed: 0,business_id,date
0,-O4VkRsc7V8iNaf0tjIzbQ,"2012-04-03 23:17:56, 2012-04-06 00:54:18, 2016..."
1,0VjVuzA5oLmq_-3AfaCHMw,"2011-05-02 22:50:20, 2011-05-12 23:25:11, 2011..."
2,HlXE8flli7TjYVpnzw6HjA,"2017-08-31 21:43:59, 2017-09-01 19:41:46, 2017..."
3,xa6FhYqjOl24lXNsdHBQww,"2015-04-24 18:26:03, 2015-04-24 18:31:04, 2015..."
4,OHzX-ZD9qyoeoxR8Z0dlIA,"2010-11-05 19:21:56, 2010-11-08 23:14:55, 2010..."
...,...,...
29185,TZP5x7nGae_dtS5ZTkT6VA,"2010-10-04 22:19:16, 2010-12-27 23:21:13, 2011..."
29186,shOM4z6DycBvv6c_6THdVg,"2020-02-13 20:08:13, 2020-02-13 21:37:26, 2020..."
29187,JICj3N8IsT77BIpueRJrvQ,"2011-01-21 03:27:03, 2012-05-16 19:10:43, 2012..."
29188,2Auycj-cW9QUj36fuMrKaQ,"2013-08-14 11:50:18, 2013-10-10 12:18:46, 2013..."


In [12]:
# Create connections for separate database files
db_path_business = '../../data/processed_data/yelp_data/yelp_business_data.db'
db_path_review = '../../data/processed_data/yelp_data/yelp_review_data.db'
db_path_user = '../../data/processed_data/yelp_data/yelp_user_data.db'
db_path_tip = '../../data/processed_data/yelp_data/yelp_tip_data.db'

conn_business = sqlite3.connect(db_path_business)
conn_review = sqlite3.connect(db_path_review)
conn_user = sqlite3.connect(db_path_user)
conn_tip = sqlite3.connect(db_path_tip)

# Create relevant tables in each db file
create_tables(conn_business, 'business')
create_tables(conn_review, 'review')
create_tables(conn_user, 'user')
create_tables(conn_tip, 'tip')

# Insert data in batches
insert_business_data(df_business, conn_business)
insert_review_data(df_review, conn_review)
insert_user_data(df_user, conn_user)
insert_tip_data(df_tip, conn_tip)
insert_checkin_data(df_checkin, conn_business)  

# Close connections
conn_business.close()
conn_review.close()
conn_user.close()
conn_tip.close()

print("Data has been successfully stored in the databases.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['attributes'] = batch['attributes'].apply(lambda x: str(x) if pd.notnull(x) else None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['hours'] = batch['hours'].apply(lambda x: str(x) if pd.notnull(x) else None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['attributes'] = batch['

Data has been successfully stored in the databases.
