In [73]:
import os

import bcrypt
import joblib
import pandas as pd
from flask import Flask, render_template, request, redirect, url_for, flash, session
from pymongo import MongoClient
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Load the persisted estimators from disk.
# NOTE(security): joblib.load unpickles arbitrary objects, so these files must
# come from a trusted source.
# NOTE(review): `scaler` and `pca` are rebound to freshly fitted objects in the
# scaling/PCA cell further down, and `kmeans_model` is not referenced in the
# visible cells -- confirm whether the saved models are meant to be used.
kmeans_model = joblib.load('kmeans_model.pkl')
scaler = joblib.load('min_max_scaler.pkl')
pca = joblib.load('pca_transformer.pkl')

app = Flask(__name__)
# Read the session-signing key from the environment so the real secret is not
# committed with the notebook; the original placeholder is kept only as a
# fallback so existing local runs keep working.
app.secret_key = os.environ.get('FLASK_SECRET_KEY', 'your_secret_key')

# MongoDB client setup
client = MongoClient('mongodb://localhost:27017/')
db = client['web_project']
user_col = db['user_profile']
user_basic_col = db['user']  # collection added later
other_col = db['web_project']





https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [74]:
# Fetch one document from user_col, leaving out the _id, password and email fields.
# NOTE(review): the empty filter does not select a specific user -- confirm
# this is the intended profile.
user_profile = user_col.find_one({}, {"_id": 0, "password": 0, "email": 0})

# find_one returns None when nothing matches; wrapping None in a DataFrame would
# silently produce a meaningless one-cell frame, so fail loudly instead.
if user_profile is None:
    raise LookupError("No document found in the 'user_profile' collection.")

# Single-row frame holding the fetched profile.
df = pd.DataFrame([user_profile])


In [75]:
# Display the single-row frame built from the fetched user profile.
# NOTE(review): the rendered output contains the user's name -- consider clearing it before sharing.
df

Unnamed: 0,name,age,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,height,job,pets,religion,smokes
0,박병윤,42,male,straight,fit,anything,socially,never,university,white,178,science,likes both,agnosticism,no


In [76]:
# Convert the profile's spelled-out sex to the one-letter codes ('m' / 'f')
# used by the rest of the data, instead of forcing every user to 'm'.
# Values other than 'male' / 'female' are left unchanged.
df['sex'] = df['sex'].replace({'male': 'm', 'female': 'f'})

In [77]:
# Display the frame again to check the `sex` column after the previous cell's update.
df

Unnamed: 0,name,age,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,height,job,pets,religion,smokes
0,박병윤,42,m,straight,fit,anything,socially,never,university,white,178,science,likes both,agnosticism,no


In [78]:
# Fetch every document in other_col, leaving out the _id, password and email fields.
excluded_fields = {"_id": 0, "password": 0, "email": 0}
all_profiles = [document for document in other_col.find({}, excluded_fields)]

# Build a DataFrame from the fetched documents.
df_all = pd.DataFrame(all_profiles)


In [79]:
# Remove the stored 'sign' and 'membership' columns; 'membership' is assigned
# again after clustering further down.
# A single non-in-place drop with errors='ignore' keeps this cell re-runnable:
# the original in-place drops raised KeyError on a second execution.
df_all = df_all.drop(columns=['sign', 'membership'], errors='ignore')

In [80]:
# Put the single-user frame on top of the full profile set (so the fetched user
# is row 0 of the renumbered index), then discard the 'name' column.
combined_df = pd.concat([df, df_all], ignore_index=True).drop(columns='name')

In [81]:
# Categorical attributes that get expanded into indicator columns.
columns_to_encode = [
    'sex', 'orientation', 'body_type',
    'diet', 'drinks', 'drugs', 'education', 'ethnicity',
    'job', 'pets', 'religion', 'smokes',
]

# One-hot encode all listed columns in a single call. The untouched columns stay
# in front, the indicator columns follow in list order, and the first level of
# each category is dropped (drop_first=True).
encoded_profiles = pd.get_dummies(combined_df, columns=columns_to_encode, drop_first=True)
encoded_profiles

Unnamed: 0,age,height,sex_m,orientation_gay,orientation_straight,body_type_big,body_type_fit,body_type_other,body_type_skinny,diet_halal,...,religion_christianity,religion_hinduism,religion_islam,religion_judaism,religion_other,smokes_sometimes,smokes_trying to quit,smokes_unknown,smokes_when drinking,smokes_yes
0,42,178,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,22,190,1,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,35,178,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,38,173,1,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,23,180,1,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59942,59,157,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
59943,24,183,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59944,42,180,1,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
59945,27,185,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [82]:
# Min-max scale the two numeric columns so they are comparable with the 0/1 indicators.
col_to_scale = ['age', 'height']
X = encoded_profiles[col_to_scale]

# NOTE(review): this rebinds `scaler` (and `pca` below), replacing the objects
# loaded from disk in the first cell -- confirm the re-fit is intentional.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
encoded_profiles[col_to_scale] = X_scaled

# Leave the sex indicator out of the clustering features and discard rows with
# missing values. Non-in-place with errors='ignore' so the cell can be re-run
# without a KeyError on the already-removed column.
encoded_profiles = encoded_profiles.drop(columns=['sex_m'], errors='ignore').dropna()

# Fixed seeds make the projection and the cluster labels reproducible across runs.
pca = PCA(n_components=46, random_state=42)
X_pca = pca.fit_transform(encoded_profiles)

k = 10
model = KMeans(n_clusters=k, n_init='auto', random_state=42)
model.fit(X_pca)

# Attach labels by index rather than by position: if dropna() removed rows, a
# positional assignment would fail on the length mismatch. Rows that were
# dropped end up with NaN membership.
combined_df['membership'] = pd.Series(model.labels_, index=encoded_profiles.index)

In [86]:
# Show the first row of combined_df together with its assigned cluster.
combined_df.head(1)

Unnamed: 0,age,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,height,job,pets,religion,smokes,membership
0,42,m,straight,fit,anything,socially,never,university,white,178,science,likes both,agnosticism,no,9


In [87]:
# Index labels of the rows that are male, have orientation 'gay', and share the
# cluster of the row labelled 1.
# NOTE(review): the fetched user profile sits at row 0 after the concat, yet
# row 1 is used as the reference here -- confirm which row is intended.
reference_cluster = combined_df.at[1, 'membership']
is_male = combined_df['sex'] == 'm'
in_same_cluster = combined_df['membership'] == reference_cluster
is_gay = combined_df['orientation'] == 'gay'
users = combined_df.loc[is_male & in_same_cluster & is_gay].index
print(f'And so we have found {len(users)} male users in the same cluster.\n ')

And so we have found 0 male users in the same cluster.
 


In [89]:
# Formula for the Euclidean distance
def distance(row, user):
    """Return the Euclidean distance between `row` and `user`.

    `row` is iterated in order and its k-th value is paired with `user[k]`,
    so `user` must support lookup by the positions 0..len(row)-1.
    The result is the square root of the summed squared differences.
    """
    squared_gaps = ((value - user[pos]) ** 2 for pos, value in enumerate(row))
    return sum(squared_gaps) ** 0.5

In [90]:
# Frame of PCA coordinates, one row per clustered profile.
# NOTE(review): this rebinds `df`, which earlier held the user-profile frame.
df = pd.DataFrame(X_pca)

# Reference point: the row labelled 1.
# NOTE(review): the fetched user profile is row 0 of combined_df -- confirm
# that row 1 is the intended reference.
user = df.loc[1]

# Distance from every selected candidate to the reference point, nearest first.
distances = df.loc[users].apply(lambda candidate: distance(candidate, user), axis=1).sort_values()

In [91]:
# Store height as integers, then preview the first ten rows.
combined_df['height'] = combined_df['height'].astype(int)
combined_df.head(10)

Unnamed: 0,age,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,height,job,pets,religion,smokes,membership
0,42,m,straight,fit,anything,socially,never,university,white,178,science,likes both,agnosticism,no,9
1,22,m,straight,avg,anything,socially,never,university,asian,190,transportation,likes both,agnosticism,sometimes,9
2,35,m,straight,avg,other,often,sometimes,space camp,white,178,hospitality,likes both,agnosticism,no,1
3,38,m,straight,skinny,anything,socially,unknown,masters degree,unknown,173,other,cat person,other,no,7
4,23,m,straight,skinny,vegetarian,socially,unknown,university,white,180,student,cat person,other,no,8
5,29,m,straight,fit,other,socially,never,university,asian,168,artistic,likes both,other,no,8
6,29,m,straight,avg,anything,socially,unknown,university,white,170,computer,cat person,atheism,no,9
7,32,f,straight,fit,anything,socially,never,university,white,165,other,likes both,other,unknown,8
8,31,f,straight,avg,anything,socially,never,university,white,165,artistic,likes both,christianity,no,9
9,24,f,straight,other,anything,socially,unknown,university,white,170,other,likes both,christianity,when drinking,9
