In [2]:
import psycopg2
import pprint
from pprint import pprint

conn = psycopg2.connect('dbname=yelp user=tlappas host=/var/run/postgresql')
conn.set_session(autocommit=True)
cur = conn.cursor()

for table in ['user_info', 'business', 'review']:
    cur.execute("""
        SELECT column_name, data_type, character_maximum_length 
        FROM INFORMATION_SCHEMA.COLUMNS 
        WHERE table_name = %s;
    """, (table,))
    print('{}\n'.format(table))
    pprint(cur.fetchall())
    print('')

user_info

[('user_id', 'character', 22),
 ('name', 'text', None),
 ('review_count', 'integer', None),
 ('yelping_since', 'date', None),
 ('friends', 'text', None),
 ('useful', 'integer', None),
 ('funny', 'integer', None),
 ('cool', 'integer', None),
 ('fans', 'integer', None),
 ('elite', 'text', None),
 ('average_stars', 'real', None),
 ('compliment_hot', 'integer', None),
 ('compliment_more', 'integer', None),
 ('compliment_profile', 'integer', None),
 ('compliment_cute', 'integer', None),
 ('compliment_list', 'integer', None),
 ('compliment_note', 'integer', None),
 ('compliment_plain', 'integer', None),
 ('compliment_cool', 'integer', None),
 ('compliment_funny', 'integer', None),
 ('compliment_writer', 'integer', None),
 ('compliment_photos', 'integer', None)]

business

[('business_id', 'character', 22),
 ('name', 'text', None),
 ('address', 'text', None),
 ('city', 'text', None),
 ('state', 'text', None),
 ('postal_code', 'text', None),
 ('lat', 'real', None),
 ('long', 'real',

In [3]:
# How many reviews?

cur.execute("""
    SELECT COUNT(*) 
    FROM review;
""")
print('Number of reviews: {}'.format(cur.fetchone()[0]))

# How many reviews with text? 

cur.execute("""
    SELECT COUNT(*) 
    FROM review
    WHERE (length(review.review_text) >= 1);
""")
print('Number of non-empty reviews: {}'.format(cur.fetchone()[0]))

Number of reviews: 6685900
Number of non-empty reviews: 6685900


In [4]:
# How many users?

cur.execute("""
    SELECT COUNT(*)
    FROM user_info;
""")
print('Number of users: {}'.format(cur.fetchone()[0]))

# How many users with at least one review?

cur.execute("""
    SELECT COUNT(user_id) 
    FROM user_info 
    WHERE review_count > 0;
""")
print('Number of users with at least one review: {}'.format(cur.fetchone()[0]))

# How many elite users?


Number of users: 1637138
Number of users with at least one review: 1637115


In [5]:
# How many businesses?

cur.execute("""
    SELECT COUNT(*)
    FROM business;
""")
print('Number of businesses: {}'.format(cur.fetchone()[0]))

# How many businesses with at least one review?

cur.execute("""
    SELECT COUNT(*)
    FROM business
    WHERE review_count > 0;
""")
print('Number of businesses with at least one review: {}'.format(cur.fetchone()[0]))

# How many resturants with at least one review?

cur.execute("""
    SELECT COUNT(*)
    FROM business
    WHERE review_count > 0 and categories LIKE '%Restaurants%';
""")
print('Number of resturants with at least one review: {}'.format(cur.fetchone()[0]))

# Where are they located?

cur.execute("""
    SELECT DISTINCT(city, state)
    FROM business;
""")
print('Cities with reviews:')
pprint(cur.fetchall())

Number of businesses: 192609
Number of businesses with at least one review: 192609
Number of resturants with at least one review: 59371
Cities with reviews:
[('("",AZ)',),
 ('("110 Las Vegas",NV)',),
 ('(Agincourt,ON)',),
 ('(AGINCOURT,ON)',),
 ('(Ahwahtukee,AZ)',),
 ('(Ahwatukee,AZ)',),
 ('("Ahwatukee Foothills Village",AZ)',),
 ('(Airdrie,AB)',),
 ('(Ajax,ON)',),
 ('(Akron,OH)',),
 ('(Alberta,AB)',),
 ('(Alburg,VT)',),
 ('(Alburgh,VT)',),
 ('(Aliquippa,PA)',),
 ('(Allegheny,PA)',),
 ('(Allentown,PA)',),
 ('("Allison Park",PA)',),
 ('(Ambridge,PA)',),
 ('(Amherst,OH)',),
 ('(Ange-Gardien,QC)',),
 ('(Anjou,QC)',),
 ('(Ansnorveldt,ON)',),
 ('(Anthem,AZ)',),
 ('(Antioch,CA)',),
 ('("Apache Junction",AZ)',),
 ('("Apache Trail",AZ)',),
 ('(Arizona,AZ)',),
 ('(Arnold,PA)',),
 ('(Arrowhead,AZ)',),
 ('(Ashburn,ON)',),
 ('(Aspinwall,PA)',),
 ('(Auburn,OH)',),
 ('("Auburn Township",OH)',),
 ('("Auburn Twp",OH)',),
 ('(Aurora,OH)',),
 ('(Aurora,ON)',),
 ('(Austin,TX)',),
 ('(Avalon,PA)',),
 ('(A

In [6]:
# How many users made reviews by city?

# cur.execute("""
#     SELECT business.city, business.state, review.user_id
# """)

# How many reviews per city?

cur.execute("""
    SELECT city, state, SUM(review_count)
    FROM business
    GROUP BY city, state
    ORDER BY SUM(review_count) DESC;
""")
print('Reviews in each city:')
pprint(cur.fetchall())

# How many restaurants in each city?

# How many resturant reviews in each city?



Reviews in each city:
[('Las Vegas', 'NV', 1964943),
 ('Phoenix', 'AZ', 706730),
 ('Toronto', 'ON', 510842),
 ('Scottsdale', 'AZ', 372805),
 ('Charlotte', 'NC', 299214),
 ('Pittsburgh', 'PA', 218769),
 ('Henderson', 'NV', 210596),
 ('Tempe', 'AZ', 195082),
 ('Mesa', 'AZ', 161847),
 ('Chandler', 'AZ', 151181),
 ('Montréal', 'QC', 148604),
 ('Gilbert', 'AZ', 123933),
 ('Cleveland', 'OH', 110872),
 ('Madison', 'WI', 101529),
 ('Glendale', 'AZ', 96375),
 ('Calgary', 'AB', 94827),
 ('Peoria', 'AZ', 55392),
 ('Mississauga', 'ON', 53711),
 ('Markham', 'ON', 49568),
 ('North Las Vegas', 'NV', 48944),
 ('Surprise', 'AZ', 33246),
 ('Champaign', 'IL', 29951),
 ('Goodyear', 'AZ', 27303),
 ('Richmond Hill', 'ON', 23419),
 ('Avondale', 'AZ', 21165),
 ('North York', 'ON', 19653),
 ('Scarborough', 'ON', 18438),
 ('Lakewood', 'OH', 17569),
 ('Concord', 'NC', 16150),
 ('Vaughan', 'ON', 15570),
 ('Huntersville', 'NC', 15010),
 ('Matthews', 'NC', 14814),
 ('Etobicoke', 'ON', 12285),
 ('Cave Creek', 'AZ', 