In [1]:
import csv
import os
import sqlite3

import pandas as pd

# Open the SQLite database file 'yelp.db' in the working directory
# (sqlite3 creates the file if it does not exist yet).
conn = sqlite3.connect('yelp.db')
# Module-level cursor on that connection.
c = conn.cursor()


### Look at the review data

In [None]:
# Load the review subset so that its column layout can be inspected.
a = pd.read_csv('data/review_subset.csv')

In [None]:
# Print each column's position next to its name.
for position, column_name in enumerate(a.columns):
    print(position, column_name)

### Look at the business data

In [None]:
# Display the business CSV as a DataFrame (the cell's last expression is rendered).
pd.read_csv('data/business_data2.csv')

In [None]:
# Display the income-by-zipcode CSV as a DataFrame.
pd.read_csv('data/income_zipcode.csv')

# Create database structure
1. Tables: `users`, `business`, `income`, and the fact table `review_fact_table`
2. Star schema: the fact table references each of the other tables by key

In [2]:
def create_database(conn):
    """Drop and recreate the star-schema tables on ``conn``.

    Creates three dimension tables (``users``, ``business``, ``income``) and
    the ``review_fact_table`` fact table that references them, then commits.
    Any rows already stored in those tables are lost.

    SQLite only enforces the FOREIGN KEY clauses on connections that have run
    ``PRAGMA foreign_keys = ON``; this function does not change that setting.

    Args:
        conn: an open ``sqlite3`` connection.
    """
    c = conn.cursor()

    # The fact table is dropped first: with foreign keys enabled, SQLite
    # refuses to drop a parent table while child rows still reference it.
    # ``states`` is never created below; presumably the drop cleans up a table
    # left behind by an earlier version of the schema.
    for table in ('review_fact_table', 'business', 'users', 'income', 'states'):
        c.execute('DROP TABLE IF EXISTS ' + table)

    # PRIMARY KEY already implies uniqueness, so no separate UNIQUE clause.
    c.execute('''
            CREATE TABLE users(
                user_id VARCHAR PRIMARY KEY,
                review_count int,
                yelping_since DATE,
                average_stars REAL)''')

    c.execute('''
            CREATE TABLE business(
                b_id VARCHAR,
                categories text,
                city text,
                is_open int,
                latitude REAL,
                longitude REAL,
                name text,
                postal_code REAL,
                pricerange int,
                review_count int,
                b_stars REAL,
                state text,
                PRIMARY KEY(b_id))''')

    c.execute('''
            CREATE TABLE income(
                zipcode INTEGER PRIMARY KEY,
                county text,
                state text,
                avg_income REAL)''')

    # Fact table: one row per review, keyed to user, business and zipcode.
    c.execute('''
        CREATE TABLE review_fact_table(
            review_id VARCHAR,
            user_id VARCHAR,
            business_id VARCHAR,
            zipcode int,
            review text,
            r_stars REAL,
            r_date DATE,
            PRIMARY KEY (review_id),
            FOREIGN KEY (user_id) REFERENCES users(user_id),
            FOREIGN KEY (zipcode) REFERENCES income(zipcode),
            FOREIGN KEY (business_id) REFERENCES business(b_id))''')

    # Commit changes
    conn.commit()
# Create the database (any existing copies of the tables are dropped first)
create_database(conn)

# Populate database
1. `users` table from `data/user_data.csv`
2. `business` table from `data/business_data2.csv`
3. `income` table from `data/income_zipcode.csv`
4. `review_fact_table` from `data/review_subset.csv`


In [3]:
def _load_csv_rows(cursor, csv_path, insert_sql, columns, error_label):
    """Run ``insert_sql`` once for every data row of the CSV at ``csv_path``.

    The header line is skipped, blank lines are ignored and every field is
    lower-cased before it is bound. ``columns`` lists the CSV column indexes
    to bind, in placeholder order. A row rejected by SQLite is reported with
    ``error_label`` and skipped; loading then continues with the next row.
    """
    # newline='' lets the csv module itself handle line breaks inside quoted fields.
    with open(csv_path, encoding='latin1', newline='') as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                continue  # blank line in the file
            row = [item.lower() for item in row]
            try:
                cursor.execute(insert_sql, tuple(row[i] for i in columns))
            except sqlite3.Error as e:
                print(error_label, e.args[0])


def populate_database(conn, data_dir='data'):
    """Load the four CSV files into the tables of ``conn``.

    Files are read from ``data_dir`` in this order: ``business_data2.csv``,
    ``income_zipcode.csv``, ``user_data.csv`` and ``review_subset.csv``.
    All values are lower-cased before insertion, and a commit is issued after
    each file. Rows that SQLite rejects are printed and skipped.

    A review is stored only when its business exists and that business's
    postal code matches a zipcode in ``income``; other reviews are dropped
    without a message because the INSERT ... SELECT produces no row for them.

    Args:
        conn: an open ``sqlite3`` connection whose tables already exist.
        data_dir: directory containing the CSV files (default ``'data'``).
    """
    # Use a cursor that belongs to the connection we were given, so the rows
    # always land in the database that is committed below.
    c = conn.cursor()

    # business table: the first twelve CSV columns map one-to-one onto the table
    _load_csv_rows(
        c,
        os.path.join(data_dir, 'business_data2.csv'),
        '''
                INSERT INTO business(b_id,
                                    categories,
                                    city,
                                    is_open,
                                    latitude,
                                    longitude,
                                    name,
                                    postal_code,
                                    pricerange,
                                    review_count,
                                    b_stars,
                                    state)VALUES(?,?,?,?,?,?,?,?,?,?,?,?)''',
        tuple(range(12)),
        "business, insert error:")
    conn.commit()

    # income table
    # col 0 = zip_code, col 1 = state_x, col 2 = county_x, col 3 = combine, col 4 = income
    _load_csv_rows(
        c,
        os.path.join(data_dir, 'income_zipcode.csv'),
        '''
                INSERT INTO income(zipcode, county,state, avg_income)VALUES(?,?,?,?)
            ''',
        (0, 2, 1, 4),
        "income, insert error:")
    conn.commit()

    # users table
    # col 0 = average_stars, col 1 = review_count, col 2 = user_id, col 3 = yelping_since
    _load_csv_rows(
        c,
        os.path.join(data_dir, 'user_data.csv'),
        '''
                INSERT INTO users(user_id, review_count, yelping_since, average_stars)VALUES(?,?,?,?)
                ''',
        (2, 1, 3, 0),
        "users, insert error:")
    conn.commit()

    # review fact table
    # col 0 = business_id, col 1 = date, col 2 = review_id, col 3 = stars
    # col 4 = text, col 5 = user_id
    # The zipcode is looked up through the business's postal code; the last
    # placeholder (business_id) selects the business.
    _load_csv_rows(
        c,
        os.path.join(data_dir, 'review_subset.csv'),
        '''
                INSERT INTO review_fact_table (review_id,
                                                user_id,
                                                business_id,
                                                zipcode,
                                                review,
                                                r_stars,
                                                r_date)
                SELECT ?, ?, business.b_id, income.zipcode, ?,?,?
                FROM income, business
                WHERE income.zipcode = business.postal_code AND business.b_id = ?
                ''',
        (2, 5, 4, 3, 1, 0),
        "review_fact_table2, insert error:")
    conn.commit()

In [4]:
# Schema creation is left commented out in this cell; only the load step runs here.
# create_database(conn)
populate_database(conn)

In [None]:
# for t in ['business', 'income', 'users', 'review_fact_table']:
#     print('%s:' % t)
#     print(pd.read_sql_query("SELECT * FROM %s" % t, conn), '\n')

# Initiate database & SQL

In [2]:
!pip install ipython-sql
%reload_ext sql
%sql sqlite:///yelp.db



'Connected: None@yelp.db'

In [None]:
%%sql

In [None]:
# %%sql
# UPDATE review_fact_table 
# SET income_id = 
# (SELECT income.id 
# FROM income, (SELECT business.b_id, states.state, states.county FROM states, business
# WHERE business.postal_code=states.zipcode) as z
# WHERE income.county = z.county AND income.state = z.state);

In [None]:
%%sql
-- The schema has no states table. State and county are columns of income, keyed by zipcode.
SELECT business.b_id, income.state, income.county
FROM income, business
WHERE business.postal_code = income.zipcode

In [8]:
%%sql
SELECT *
FROM review_fact_table
LIMIT 1000

Done.


review_id,user_id,business_id,zipcode,review,r_stars,r_date
byrzj8rf2kjwlr-cunu6ea,kzyloqijvyw_fwftw2rjiq,jqsnfozdpxpmourswcg1vq,85374,"this place is horrible, we were so excited to try it since i got a gift card for my birthday. we went in an ordered are whole meal and they did not except are gift card, because their system was down. unacceptable, this would have been so helpful if we would have known this prior!!",1.0,2017-06-03
i5uwuplqfplce8p2gpfwbw,wzxp9-v2dqrrjqhggrquea,jqsnfozdpxpmourswcg1vq,85374,"for being fairly ""fast"" food.. pei wei (pronounced pay way i confirmed haha) is pretty darn good. we got a few things to share. i had the asian chicken salad and was impressed! there was a decent amount of chicken. some more veggies would be nice, but overall pretty good. the steak teriyaki was great as well as the fried rice. over all good was good! nice, clean, and reasonable.",4.0,2015-03-26
eyqyvttg2jx4or9bb8pc9g,xylt12exfdlii_3udlvipw,jqsnfozdpxpmourswcg1vq,85374,"i decided to try it out, i'm celiac and therefore can only eat gluten free... they have an easy to understand gf menu with anything you can possibly want. i placed my order online and picked the exact pickup time. i cam and my food was ready for me. driving home the smell in my car was so good i could barely wait to get home and try it - true umami! i got home and dug into the delicious spicy chicken and rice with sugar snapies and carrots. it was superb! for $9 i will definitely try this again! i did see a huge line at the store, so try doing an online order and pickup forsure! p.s. they even include gf soy sauce!",5.0,2012-12-30
g-efa005besj5uhsh0sqfa,ji9peffxjwqplo7pefspkq,jqsnfozdpxpmourswcg1vq,85374,"i'm not saying pei wei is the best asian food i've ever tasted, far from it, it's a fairly large chain that puts on the appearance of something more refined, it's essentially to asian food what olive garden is to italian food. with that said i've always had pretty good experiences with pei wei, the food although not spectacular is better than some of the overcooked chicken drowning in msg offered by some of the local chinese restaurants. the portions are good sized, the food is generally consistent, and the prices are really reasonable considering this is a corporate chain in some cases cheaper than the local establishments. or dare i say it's name ""panda express"" (which is overpriced crap) the time before last that i went they forgot the tomato's and the dressing for an asian chopped chicken salad that my wife ordered, i didn't discover that the dressing was missing until i got home, i immediately called the restaurant and was speaking to a manager within 30 seconds. the manager apologized and asked me if i would like to come back to the restaurant or if he could have my address so he could send me a gift certificate. i decided to go back to the restaurant, when i got the restaurant i told the person at the counter my name and they already had a bag set aside for me, the manager came over and explained to me there was another full salad in the bag, and he put additional dressing for the salad we already had, and additionally he gave me a coupon for free lettuce wraps.(which mental note: i need to use) i must say i was impressed with this manager, and it was refreshing after being in situations where a manager has taken back the bag/plate and essentially ""un-f$@k's"" your food and returns it to you. overall great customer service, consistent food, and a good option for takeout in surprise.",3.0,2009-01-12
6pcjsgubsljt4vlxos5c4a,tliwzajpret0zx4_vgvlhg,jqsnfozdpxpmourswcg1vq,85374,sometimes the food is spot on and delicious and other times it is quite salty at this location. very difficult to get a consistently good meal. menu items add up quickly.,3.0,2015-07-11
pfjmyzd_lnba_y3kbx1vva,jzeitnwbwmv6mooxycaamq,jqsnfozdpxpmourswcg1vq,85374,decent customer service but the food was awful. it was cold and had no sauce at all. i was expecting it to be good but this place really went down hill. i will never eat here again.,1.0,2015-05-27
_qv1fqutolrkmug6pv4gzw,e56svqt5-owfsejjrma8_w,jqsnfozdpxpmourswcg1vq,85374,"super clean restaurant and friendly staff. fresh food. hasn't been sitting under heat lamps. no msg, this is the good stuff. i have to have the kung pao chicken weekly.",5.0,2015-02-28
s2mlqrfnapegtcneu3ej4q,4wyico4emeca9r7spyqkbw,jqsnfozdpxpmourswcg1vq,85374,"found this the other night. it is the pf chang fast food option and it worked perfectly for us. limited menu, but lower prices. very basic decor, but clean and fast seating. lettuce wraps just as good as chang's. very busy, especially the take out. glad to have it close",4.0,2010-04-05
oiszzrrbi3y01_wqu528zq,p8mvj7azwjtffh5fxbbmug,jqsnfozdpxpmourswcg1vq,85374,"the staff here is great and they're nice, wonderful and quick. people were ranting in raving about pei wei, i had to try it. even good yelp reviews. i'm highly dissatisfied with the flavor of the food. this should be labeled asian inspired and not asian. i've tried a variety of chinese restaurants, this doesn't taste close to anything i've had at other asian restaurants. their mongolian beef was 5 pieces of beef and large mushrooms cut into thirds in a thick sauce. you eat the rice to wash off the nasty flavor. my shrimp was thickly coated in an overpowering sauce as well. i only ate some of the veggies that take center stage on a meat dish. the center of my pork egg roll was cold. the hot n sour soup was a much thicker consistency almost like that of a chili instead of being brothy. worst of all was the price. this was not worth it to us. neither me or my husband enjoyed either of our dishes. we didn't even eat half of our plates. we even refused to take it home with us. if you like and enjoy what typical asian food tastes like, don't waste your time here.",1.0,2015-05-22
4bpjre9vi0hhyzzyyyv0bq,7y4nebqqwg7j-tvrqi6uzq,jqsnfozdpxpmourswcg1vq,85374,i had the garlic ginger broccoli chicken and it was not very good. the broccoli was hardly cooked and the sauce was way to sweet. everything else was great. i will give them a few more tries before i write them off as another crappy asian restaurant in surprise.,2.0,2011-06-15


In [7]:
%%sql
SELECT DISTINCT zipcode
FROM review_fact_table

Done.


zipcode
85374
44060
15217
85008
28262
89108
44130
44103
44114
44224


# Try SQL

In [9]:
%%sql
SELECT count(r.review_id)
FROM review_fact_table AS r
INNER JOIN business ON business.b_id = r.business_id
WHERE business.state = 'nv'

Done.


count(r.review_id)
772691


In [10]:
%%sql
SELECT count(b_id)
FROM business
WHERE state = 'pa'

Done.


count(b_id)
2641


In [11]:
%%sql
SELECT count(user_id)
FROM users

Done.


count(user_id)
1183362


In [12]:
%%sql
SELECT count(avg_income)
FROM income

Done.


count(avg_income)
4742


In [13]:
%%sql
SELECT DISTINCT state
FROM income

Done.


state
pa
nc
oh
il
az
nv


In [15]:
%%sql
SELECT COUNT(distinct business.b_id)
FROM business
INNER JOIN review_fact_table AS r ON r.zipcode = business.postal_code

Done.


COUNT(distinct business.b_id)
20678


In [16]:
%%sql
SELECT COUNT(distinct business.b_id)
FROM business

Done.


COUNT(distinct business.b_id)
21380


In [None]:
%%sql
-- Businesses whose postal code matches no review zipcode. A cross join with != would instead return almost every business.
SELECT business.b_id
FROM business
WHERE NOT EXISTS (
    SELECT 1
    FROM review_fact_table r
    WHERE r.zipcode = business.postal_code
)

Done.


In [None]:
%%sql
SELECT categories
FROM business
WHERE categories LIKE '%fast food%'
LIMIT 50

In [None]:
%%sql
-- review_fact_table has no income_id column. Its key into the income table is zipcode.
SELECT DISTINCT zipcode
FROM review_fact_table

In [None]:
%%sql
SELECT *
FROM business
LIMIT 50

In [None]:
%%sql
SELECT business_id
FROM review_fact_table
LIMIT 10

In [None]:
%%sql
SELECT DISTINCT business.state
FROM business
INNER JOIN review_fact_table ON review_fact_table.business_id = business.b_id
LIMIT 50

In [None]:
#import pandas as pd
#sql = "SELECT * FROM TABLE"

#df = pd.read_sql(sql, cnxn)