In [None]:
#!/usr/bin/env python
# coding: utf-8

###################################################################
#                                                                 #
#   2024 DS2 Database Project : Recommendation using SQL-Python   #
#                                                                 #
###################################################################

import mysql.connector
from tabulate import tabulate
import pandas as pd
import math
import sys

## Connect to Remote Database
## Insert database information


# Connection settings for the remote MySQL server.
# NOTE(review): the credentials are hard-coded in the source; consider loading
# them from the environment before sharing this file.
HOST = "147.46.15.238"  # IP of the server on which the database is created / connected
PORT = "7000"
USER = "DS2024_0046"
PASSWD = "DS2024_0046"
DB = "DS_proj_19"

connection = mysql.connector.connect(
    host=HOST,
    port=int(PORT),  # honour the PORT setting instead of a duplicated literal
    user=USER,
    passwd=PASSWD,
    db=DB,
    autocommit=True  # to create table permanently
)

# Module-level cursor shared by the query helpers below; dictionary=True is
# requested so that rows come back keyed by column name.
cur = connection.cursor(dictionary=True)

## You do not need to modify this function.
# DO NOT CHANGE INITIAL TABLES IN prj.sql
def get_dump(mysql_con, filename):
    '''
    Execute every SQL statement contained in a .sql dump file.

    The file is read line by line.  Blank lines are skipped; the remaining
    lines are stripped and accumulated until one ends with ";", at which
    point the accumulated statement is executed.  Text that follows the last
    ";" in the file is never executed.

    mysql_con : connection object whose cursor() works as a context manager
    filename  : path of the .sql file to load

    If a Warning is raised while loading, it is printed and the process
    exits through sys.exit().
    '''
    parts = []
    try:
        # Open the dump in a with-block so the handle is closed even on error.
        with mysql_con.cursor() as cursor, open(filename, 'r') as dump:
            for raw in dump:
                line = raw.strip()
                if not line:
                    continue
                parts.append(line)
                if line.endswith(";"):
                    # Join with a space: the stripped newline was the only
                    # separator between tokens on consecutive lines.
                    cursor.execute(" ".join(parts))
                    parts = []

    except Warning as warn:
        print(warn)
        sys.exit()


## You do not need to modify this function.
# Sends the given SQL query and returns its result as a DataFrame.
def get_output(query):
    """Execute *query* on the module-level cursor and return all fetched rows as a DataFrame."""
    cur.execute(query)
    rows = cur.fetchall()
    return pd.DataFrame(rows)


# [Algorithm 1] Popularity-based Recommendation - 1 : Popularity by rating count
def popularity_based_count(user_input=True, item_cnt=None):
    """Write a popularity-by-count recommendation table to pbc.txt.

    user_input : when true, the number of recommendations is read from stdin
    item_cnt   : number of recommendations when user_input is false
                 (must not be None in that case)

    The table currently holds placeholder rows, not a database result.
    """
    if not user_input:
        assert item_cnt is not None
        rec_num = int(item_cnt)
    else:
        rec_num = int(input('Number of recommendations?: '))
    print("Popularity Count based recommendation")
    print("=" * 99)

    # TODO: remove sample, return actual recommendation result as df
    # YOUR CODE GOES HERE !
    # Store the result of the query in the `sample` variable.
    sample = []
    for rank in range(rec_num):
        sample.append((rank, 5.0 - 0.1 * rank))

    # do not change column names
    df = pd.DataFrame(sample, columns=['item', 'count'])

    # TODO end

    # Do not change this part
    with open('pbc.txt', 'w') as f:
        f.write(tabulate(df, headers=df.columns, tablefmt='psql', showindex=False))
    print("Output printed in pbc.txt")


# [Algorithm 1] Popularity-based Recommendation - 2 : Popularity by average rating
def popularity_based_rating(user_input=True, item_cnt=None):
    """Write a popularity-by-rating recommendation table to pbr.txt.

    user_input : when true, the number of recommendations is read from stdin
    item_cnt   : number of recommendations when user_input is false
                 (must not be None in that case)

    The table currently holds placeholder rows, not a database result.
    """
    if not user_input:
        assert item_cnt is not None
        rec_num = int(item_cnt)
    else:
        rec_num = int(input('Number of recommendations?: '))
    print("Popularity Rating based recommendation")
    print("=" * 99)

    # TODO: remove sample, return actual recommendation result as df
    # YOUR CODE GOES HERE !
    # Store the result of the query in the `sample` variable.
    sample = []
    for rank in range(rec_num):
        sample.append((rank, 5.0 - 0.1 * rank))

    # do not change column names
    df = pd.DataFrame(sample, columns=['item', 'prediction'])
    # TODO end

    # Do not change this part
    with open('pbr.txt', 'w') as f:
        f.write(tabulate(df, headers=df.columns, tablefmt='psql', showindex=False))
    print("Output printed in pbr.txt")


# [Algorithm 2] Item-based Recommendation
def ibcf(user_input=True, user_id=None, rec_threshold=None, rec_max_cnt=None):
    """Write an item-based collaborative-filtering table to ibcf.txt.

    user_input    : when true, user id, count and threshold are read from stdin
    user_id       : target user when user_input is false
    rec_threshold : prediction threshold when user_input is false
    rec_max_cnt   : recommendation count when user_input is false

    The table currently holds placeholder rows, not a database result.
    """
    if not user_input:
        assert user_id is not None
        assert rec_max_cnt is not None
        assert rec_threshold is not None
        user = int(user_id)
        rec_cnt = int(rec_max_cnt)
        rec_num = float(rec_threshold)
    else:
        user = int(input('User Id: '))
        # rec_cnt is parsed but not used by the placeholder below.
        rec_cnt = int(input('Recommend Count: '))
        rec_num = float(input('Recommendation Threshold: '))

    print("=" * 99)
    print('Item-based Collaborative Filtering')
    print(f'Recommendations for user {user}')

    # TODO: remove sample, return actual recommendation result as df
    # YOUR CODE GOES HERE !
    # Store the result of the query in the `sample` variable.
    # Placeholder: scores run from 5.0 down to the threshold in 0.1 steps.
    lowest_tenth = math.ceil(rec_num * 10)
    sample = []
    for tenth in range(50, lowest_tenth - 1, -1):
        sample.append((user, 50 - tenth, tenth / 10))

    # do not change column names
    df = pd.DataFrame(sample, columns=['user', 'item', 'prediction'])
    # TODO end

    # Do not change this part
    with open('ibcf.txt', 'w') as f:
        f.write(tabulate(df, headers=df.columns, tablefmt='psql', showindex=False))
    print("Output printed in ibcf.txt")



# [Algorithm 3] (Optional) User-based Recommendation
def ubcf(user_input=True, user_id=None, rec_threshold=None, rec_max_cnt=None):
    """Write a user-based collaborative-filtering table to ubcf.txt.

    user_input    : when true, user id, count and threshold are read from stdin
    user_id       : target user when user_input is false
    rec_threshold : prediction threshold when user_input is false
    rec_max_cnt   : recommendation count when user_input is false

    The table currently holds placeholder rows, not a database result.
    """
    if not user_input:
        assert user_id is not None
        assert rec_max_cnt is not None
        assert rec_threshold is not None
        user = int(user_id)
        rec_cnt = int(rec_max_cnt)
        rec_num = float(rec_threshold)
    else:
        user = int(input('User Id: '))
        # rec_cnt is parsed but not used by the placeholder below.
        rec_cnt = int(input('Recommend Count: '))
        rec_num = float(input('Recommendation Threshold: '))

    print("=" * 99)
    print('User-based Collaborative Filtering')
    print(f'Recommendations for user {user}')

    # TODO: remove sample, return actual recommendation result as df
    # YOUR CODE GOES HERE !
    # Store the result of the query in the `sample` variable.
    # Placeholder: scores run from 5.0 down to the threshold in 0.1 steps.
    stop_before = math.ceil(rec_num * 10) - 1
    sample = []
    for tenth in range(50, stop_before, -1):
        sample.append((user, 50 - tenth, tenth / 10))

    # do not change column names
    df = pd.DataFrame(sample, columns=['user', 'item', 'prediction'])
    # TODO end

    # Do not change this part
    with open('ubcf.txt', 'w') as f:
        f.write(tabulate(df, headers=df.columns, tablefmt='psql', showindex=False))
    print("Output printed in ubcf.txt")


## You do not need to modify this function.
# Print and execute menu
def menu():
    """Print the action menu and return the chosen option as an int in 0..5.

    The prompt is repeated until a valid choice is entered.  Input that is
    not an integer is treated like an out-of-range number: the warning is
    printed and the user is asked again.
    """
    print("=" * 99)
    print("0. Initialize")
    print("1. Popularity Count-based Recommendation")
    print("2. Popularity Rating-based Recommendation")
    print("3. Item-based Collaborative Filtering")
    print("4. User-based Collaborative Filtering")
    print("5. Exit database")
    print("=" * 99)

    while True:
        try:
            m = int(input("Select your action : "))
        except ValueError:
            # Non-numeric input must re-prompt rather than abort the program.
            print("Wrong input. Enter again.")
            continue
        if 0 <= m <= 5:
            return m
        print("Wrong input. Enter again.")

def execute(argv):
    """Run the program interactively or from a command file.

    argv : argument list in sys.argv form.  With fewer than two entries the
           interactive menu is shown repeatedly until option 5 is chosen.
           Otherwise argv[1] names a text file with one comma-separated
           command per line (menu option first, then its arguments); the
           file is processed once and the function returns.
    """
    terminated = False
    while not terminated:
        if len(argv) < 2:
            m = menu()
            if m == 0:
                # You do not need to modify this call.
                # Upload prj.sql before this
                # If autocommit=False, always execute after making cursor
                get_dump(connection, 'prj.sql')
            elif m == 1:
                popularity_based_count()
            elif m == 2:
                popularity_based_rating()
            elif m == 3:
                ibcf()
            elif m == 4:
                ubcf()
            elif m == 5:
                terminated = True

        # Code used for evaluation.
        else:
            with open(argv[1], 'r') as f:
                lines = f.readlines()
            for line in lines:
                if not line.strip():
                    # A blank line carries no command; float('') would raise.
                    continue
                rec_args = list(map(float, line.split(',')))
                if len(rec_args) > 1:
                    rec_args[1] = int(rec_args[1])
                m = rec_args[0]
                if m == 0:
                    get_dump(connection, 'prj.sql')
                elif m == 1:
                    popularity_based_count(False, *rec_args[1:])
                elif m == 2:
                    popularity_based_rating(False, *rec_args[1:])
                elif m == 3:
                    ibcf(False, *rec_args[1:])
                elif m == 4:
                    ubcf(False, *rec_args[1:])
                elif m == 5:
                    terminated = True
                else:
                    print('Invalid menu option')
            # The command file is a finite batch: stop after one pass so a
            # file without a "5" line does not re-run its commands forever.
            terminated = True

# DO NOT CHANGE
# Script entry point: hand the command-line arguments to execute().
if __name__ == "__main__":
    execute(sys.argv)


In [54]:
import mysql.connector
from tabulate import tabulate
import pandas as pd
import numpy as np
import math
import sys

## Connect to Remote Database
## Insert database information


# Connection settings for the remote MySQL server.
# NOTE(review): the credentials are hard-coded in the source; consider loading
# them from the environment before sharing this notebook.
HOST = "147.46.15.238"  # IP of the server on which the database is created / connected
PORT = "7000"
USER = "DS2024_0046"
PASSWD = "DS2024_0046"
DB = "DS_proj_19"

connection = mysql.connector.connect(
    host=HOST,
    port=int(PORT),  # honour the PORT setting instead of a duplicated literal
    user=USER,
    passwd=PASSWD,
    db=DB,
    autocommit=True  # to create table permanently
)

# Cursor shared by the query cells; dictionary=True is requested so that rows
# come back keyed by column name.
cur = connection.cursor(dictionary=True)

In [55]:
def get_output(query):
    """Execute *query* on the shared cursor and return all fetched rows as a DataFrame."""
    cur.execute(query)
    fetched = cur.fetchall()
    return pd.DataFrame(fetched)

In [56]:
def popularity_based_rating(user_input=True, item_cnt=None):
    """Write a popularity-by-rating recommendation table to pbr.txt.

    user_input : when true, the number of recommendations is read from stdin
    item_cnt   : number of recommendations when user_input is false
                 (must not be None in that case)

    The table currently holds placeholder rows, not a database result.
    """
    if not user_input:
        assert item_cnt is not None
        rec_num = int(item_cnt)
    else:
        rec_num = int(input('Number of recommendations?: '))
    print("Popularity Rating based recommendation")
    print("=" * 99)

    # TODO: remove sample, return actual recommendation result as df
    # YOUR CODE GOES HERE !
    # Store the result of the query in the `sample` variable.
    sample = []
    for position in range(rec_num):
        sample.append((position, 5.0 - 0.1 * position))

    # do not change column names
    df = pd.DataFrame(sample, columns=['item', 'prediction'])
    # TODO end

    # Do not change this part
    with open('pbr.txt', 'w') as f:
        f.write(tabulate(df, headers=df.columns, tablefmt='psql', showindex=False))
    print("Output printed in pbr.txt")

In [30]:
# Popularity by rating count: count the non-null ratings of each item, keep
# items with 150 <= item < 350, and list the rec_num most-rated ones
# (ties ordered by item).
rec_num = 10
get_output(
    f'''
    select item, count from
    (
        select item, count(rating) as count
        from ratings
        where rating is not null
        group by item
    ) a
    where item >= 150
    and item < 350
    order by count desc, item
    limit {rec_num}
    '''
)

Unnamed: 0,item,count
0,203,194
1,232,187
2,200,167
3,233,155
4,208,153
5,227,152
6,244,152
7,172,151
8,280,149
9,234,148


In [5]:
# Popularity by average rating: rescale every non-null rating to the range
# spanned by its user's own min and max rating, average the rescaled values
# per item (items with 150 <= item < 350) and keep the rec_num best items.
# Ties on the average are broken by item so that LIMIT always returns the
# same rows.
rec_num = 10
get_output(
    f'''
    select item, round(avg(rating),4) avg from
    (
        select r.user, r.item, ((r.rating - a.min_r) / (a.max_r - a.min_r)) as rating from ratings r
        left outer join
        (select user, min(rating) min_r, max(rating) max_r
        from ratings
        group by user) a
        on a.user = r.user
        where rating is not null
        and item >= 150
        and item < 350
    ) a
    group by item
    order by avg desc, item
    limit {rec_num}
    '''
)

Unnamed: 0,item,avg
0,316,0.8666
1,203,0.8634
2,307,0.8347
3,290,0.8321
4,310,0.8311
5,306,0.8268
6,295,0.8265
7,157,0.8244
8,292,0.8227
9,344,0.8163


In [6]:
# Popularity by average rating: rescale every non-null rating to the range
# spanned by its user's own min and max rating, average the rescaled values
# per item (items with 150 <= item < 350) and keep the rec_num best items.
# Ties on the average are broken by item so that LIMIT always returns the
# same rows.
rec_num = 10
get_output(
    f'''
    select item, round(avg(rating),4) avg from
    (
        select r.user, r.item, ((r.rating - a.min_r) / (a.max_r - a.min_r)) as rating from ratings r
        left outer join
        (select user, min(rating) min_r, max(rating) max_r
        from ratings
        group by user) a
        on a.user = r.user
        where rating is not null
        and item >= 150
        and item < 350
    ) a
    group by item
    order by avg desc, item
    limit {rec_num}
    '''
)

Unnamed: 0,item,avg
0,316,0.8666
1,203,0.8634
2,307,0.8347
3,290,0.8321
4,310,0.8311
5,306,0.8268
6,295,0.8265
7,157,0.8244
8,292,0.8227
9,344,0.8163


In [22]:
# IBCF
# For every item_1 keep its 5 most similar items (highest sim first, ties
# broken by the smaller item_2) and divide each kept similarity by the sum of
# the kept similarities of that item_1 (per_sim).
# NOTE(review): rec_num is assigned here but the query below does not use it.
rec_num = 4
df_sim = get_output(
    f'''
       SELECT a.item_1, a.item_2, a.sim / SUM(a.sim) OVER (PARTITION BY item_1) as per_sim
        FROM (
        SELECT 
            item_1, 
            item_2, 
            sim,
            ROW_NUMBER() OVER (PARTITION BY item_1 ORDER BY sim DESC, item_2 ASC) AS rn
        FROM item_similarity
        ) a
        WHERE a.rn <= 5
    '''
)
# df_sim = df_sim.pivot(index='item_1', columns='item_2', values='per_sim').fillna(0)
# df_sim = df_sim.reindex(columns=range(453), fill_value=0)
df_sim


Unnamed: 0,item_1,item_2,per_sim
0,0,3,0.20000
1,0,7,0.20000
2,0,8,0.20000
3,0,9,0.20000
4,0,10,0.20000
...,...,...,...
2260,452,1,0.20000
2261,452,5,0.20000
2262,452,6,0.20000
2263,452,7,0.20000


In [55]:
# Rating list in which every NULL rating is replaced by the average rating of
# the same item (rounded to 4 decimals), ordered by item and then user.
df = get_output(
'''
SELECT 
    a.item,
    a.user,
    COALESCE(a.rating, avg_rating) AS rating
FROM (
 SELECT 
        user,
        item,
        rating,
        round(AVG(rating) OVER (PARTITION BY item),4) AS avg_rating
    FROM ratings
) a
order by a.item, a.user
'''
)
# df = df.pivot(index='item', columns='user', values='rating')
df

Unnamed: 0,item,user,rating
0,0,0,4.0000
1,0,1,4.0000
2,0,2,3.3641
3,0,3,3.0000
4,0,4,3.3641
...,...,...,...
132271,452,287,3.2188
132272,452,288,3.2188
132273,452,289,3.2188
132274,452,290,3.0000


In [62]:
# Item-based CF predictions for user 5.
#   a : every (user, item) rating, with NULL ratings replaced by the item's
#       average rating
#   b : for each item_1, its 5 most similar items and their similarities
#       divided by the sum of those 5 similarities (per_sim)
# The prediction for (user, item_1) is the per_sim-weighted sum of the user's
# ratings of the similar items.  Items the user has already rated are removed
# and the rest is ordered by prediction, then item.
#
# a and b are combined with an INNER join: with an outer join, ratings of
# items that appear in no top-5 list have no item_1 and collapse into a group
# whose item is NULL, which the NOT EXISTS filter cannot remove.
df = get_output(
'''
    select user, item, predict from
    (select c.user, c.item, round(sum(c.rating * c.per_sim),4) as predict from
    (SELECT
        a.user,
        b.item_1 item,
        COALESCE(a.rating, avg_rating) AS rating,
        b.per_sim
    FROM (
    SELECT
            user,
            item,
            rating,
            AVG(rating) OVER (PARTITION BY item) AS avg_rating
        FROM ratings
    ) a
    inner join
    (SELECT a.item_1, a.item_2, a.sim / SUM(a.sim) OVER (PARTITION BY item_1) as per_sim
            FROM (
            SELECT
                item_1,
                item_2,
                sim,
                ROW_NUMBER() OVER (PARTITION BY item_1 ORDER BY sim DESC, item_2 ASC) AS rn
            FROM item_similarity
            ) a
            WHERE a.rn <= 5) b
    on a.item = b.item_2) c
    group by c.user, c.item
    ) d
    where user = 5
    and    not exists (select *
                              from ratings
                              where user = d.user
                              and   item = d.item
                              and   rating is not null)
    order by predict desc, item
'''
)

df.head()

Unnamed: 0,user,item,predict
0,5,439.0,4.6
1,5,440.0,4.4
2,5,109.0,4.2243
3,5,211.0,4.0609
4,5,449.0,3.9721


In [72]:
# For every user_1 keep the 5 most similar users (highest sim first, ties
# broken by the smaller user_2) and express each kept similarity as a
# fraction of the sum of the kept similarities, rounded to 4 decimals
# (per_sim).
df = get_output(
'''
 
    
    select a.user_1, a.user_2, round(a.sim / sum(a.sim) over (partition by a.user_1),4) as per_sim from (
            SELECT 
                user_1, user_2, sim, row_number() over (partition by user_1 order by sim desc, user_2) as rn
            FROM user_similarity) a
    where a.rn <= 5
'''
)

df

Unnamed: 0,user_1,user_2,per_sim
0,0,35,0.2000
1,0,36,0.2000
2,0,40,0.2000
3,0,49,0.2000
4,0,54,0.2000
...,...,...,...
1455,291,59,0.2000
1456,291,189,0.2000
1457,291,201,0.2000
1458,291,202,0.2000


In [88]:
# User-based CF predictions for user 5.
#   a : every (user, item) rating, with NULL ratings replaced by the user's
#       average rating
#   b : for each user_1, the 5 most similar users and their similarities as a
#       fraction of the sum of those 5 similarities (per_sim)
# The prediction for (user_1, item) is the per_sim-weighted sum of the similar
# users' ratings of that item.  Items user 5 has already rated are removed and
# the rest is ordered by prediction, then item.
# NOTE(review): per_sim is rounded to 4 decimals before it is used as a
# weight, so the weights may not sum to exactly 1 - confirm this is intended.
# NOTE(review): the outer join also yields rows whose user_1 is NULL; they are
# dropped by "where user = 5", so an inner join looks sufficient - confirm.
df = get_output(
'''
select user, item, predict from(
select c.item, c.user, round(sum(c.rating * c.per_sim),4) as predict from
(
SELECT 
        a.item,
        b.user_1 user,
        COALESCE(a.rating, avg_rating) AS rating,
        b.per_sim
    FROM (
        SELECT 
            user,
            item,
            rating,
            AVG(rating) OVER (PARTITION BY user) AS avg_rating
        FROM ratings ) a
left outer join
(select a.user_1, a.user_2, round(a.sim / sum(a.sim) over (partition by a.user_1),4) as per_sim from (
            SELECT 
                user_1, user_2, sim, row_number() over (partition by user_1 order by sim desc, user_2) as rn
            FROM user_similarity) a
    where a.rn <= 5) b
on a.user = b.user_2
) c
group by c.item, c.user ) d
where user = 5
and    not exists (select *
                            from ratings
                            where user = d.user
                            and   item = d.item
                            and   rating is not null)
order by predict desc, item
'''
)

df

Unnamed: 0,user,item,predict
0,5,152,4.1948
1,5,440,4.1
2,5,102,4.0757
3,5,213,3.9161
4,5,449,3.9142
5,5,103,3.909
6,5,437,3.8583
7,5,444,3.8382
8,5,109,3.8109
9,5,439,3.78
