# 1. 라이브러리 및 Train Data

## 데이터 Read, Merge

In [None]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tqdm import trange
encoder=LabelEncoder()
pd.set_option('display.max_columns',500)

In [None]:
# Load the three raw tables, then attach user and movie columns to every
# rating row (left joins keep all rating rows).
user = pd.read_csv('users.csv')
movie = pd.read_csv('movies.csv')
train1 = pd.read_csv('rating.csv')
train2 = pd.merge(train1, user, how='left', on='user')
train3 = pd.merge(train2, movie, how='left', on='movie')

## 각 Dummy 변수마다 Rating 평균을 보여주는 DataFrame 작성

In [None]:
# One lookup table per grouping key: the key column plus the mean rating of
# all train rows sharing that key, stored under a 'rating_<key>' column.
movie_rating = train3.groupby('movie')['rating'].mean().rename('rating_movie').reset_index()
genre_rating = train3.groupby('genre')['rating'].mean().rename('rating_genre').reset_index()
user_rating = train3.groupby('user')['rating'].mean().rename('rating_user').reset_index()
job_rating = train3.groupby('job')['rating'].mean().rename('rating_job').reset_index()
zip_rating = train3.groupby('zip')['rating'].mean().rename('rating_zip').reset_index()
age_rating = train3.groupby('age')['rating'].mean().rename('rating_age').reset_index()

## DataFrame 들 합치기

In [None]:
# Attach the age, job, movie and genre mean ratings to each train row.
train4 = pd.merge(train3, age_rating, how='left', on='age')
train5 = pd.merge(train4, job_rating, how='left', on='job')
train6 = pd.merge(train5, movie_rating, how='left', on='movie')
train7 = pd.merge(train6, genre_rating, how='left', on='genre')

In [None]:
# Summary statistics of the merged training frame.
train7.describe()

## 영화별 Rating 평균 보정 - 유저 수에 따라 다른 값 부여

In [None]:
# Count the rating rows each movie has in train and flag movies with at
# least 140 of them; the flag is then joined back onto every train row.
movie_user = train7.groupby(['movie'])[['user']].count().reset_index()
movie_user['count_over140'] = movie_user['user'].ge(140)
movie_count = movie_user.drop(columns=['user'])
train8 = pd.merge(train7, movie_count, how='left', on='movie')

In [None]:
# Adjusted movie mean: keep the movie's own mean rating when the movie is
# flagged count_over140, otherwise fall back to the mean of its genre.
# Vectorized replacement for the former per-row Python loop, which also
# relied on label-based lookups (train8[col][i]) matching row positions.
train8['rating_movie_adj'] = np.where(
    train8['count_over140'],
    train8['rating_movie'],
    train8['rating_genre'],
)


## Train 데이터 중간 정리


In [None]:
# Remove the listed columns from train8 in place, then preview three rows.
train8.drop(
    columns=['id', 'count_over140', 'rating_movie', 'title',
             'sub1', 'sub2', 'sub3', 'sub4', 'sub5'],
    inplace=True,
)
train8.head(3)

## Zip을 지역으로 변환

In [None]:
# Region = first character of the zip code rendered as text; the zip column
# itself is discarded.
# NOTE(review): if 'zip' was parsed as an integer column, leading zeros are
# already lost before this step — confirm the dtype read from users.csv.
train9 = train8.drop(columns=['zip'])
train9['region'] = train8['zip'].astype(str).str[:1]
# Mean train rating per region, joined back onto every row.
region_rating = train9.groupby('region')['rating'].mean().rename('rating_region').reset_index()
train10 = pd.merge(train9, region_rating, how='left', on='region')

## 유저 별 평균 추가

In [None]:
# Attach each user's mean rating; `train` is the frame used for modelling.
train = pd.merge(train10, user_rating, how='left', on='user')
train.head()

### Rating 변수들 표준화

In [None]:
# train['rating_user_mean']=train['rating_user'].mean()
# train['rating_user_std']=train['rating_user'].std()
# std=[]
# std=(train['rating_user']-train['rating_user_mean'])/train['rating_user_std']
# train['rating_user']=std

In [None]:
# train['rating_movie_mean']=train['rating_movie_adj'].mean()
# train['rating_movie_std']=train['rating_movie_adj'].std()
# std2=[]
# std2=(train['rating_movie_adj']-train['rating_movie_mean'])/train['rating_movie_std']
# train['rating_movie_adj']=std2

In [None]:
# train['rating_region_mean']=train['rating_region'].mean()
# train['rating_region_std']=train['rating_region'].std()
# std3=[]
# std3=(train['rating_region']-train['rating_region_mean'])/train['rating_region_std']
# train['rating_region']=std3

In [None]:
# train.drop(['rating_user_mean','rating_user_std','rating_movie_mean','rating_movie_std','rating_region_mean','rating_region_std'],axis=1, inplace=True)

# 2. Test Data

## 데이터 Read, Merge

In [None]:
# Load the test rows and attach user/movie columns plus the age and job
# mean ratings that were computed from the training data.
test = pd.read_csv('test.csv')
test1 = pd.merge(test, user, how='left', on='user')
test2 = pd.merge(test1, movie, how='left', on='movie')
test3 = pd.merge(test2, age_rating, how='left', on='age')
test4 = pd.merge(test3, job_rating, how='left', on='job')

## 영화별 Rating 보정된 값 대입

In [None]:
# Per-movie lookup of the adjusted movie mean computed on train, exposed to
# the test frame under the column name 'rating_movie'.
# first() returns a plain float per movie. The previous unique() produced
# one-element arrays whose conversion via astype(float) depends on
# array-to-scalar coercion that recent NumPy releases deprecate.
# NOTE(review): presumably every movie carries a single adjusted value in
# train; first() takes the first non-null one if that ever does not hold.
movie_rating2 = (
    train.groupby('movie')['rating_movie_adj']
    .first()
    .rename('rating_movie')
    .reset_index()
)
test5 = test4.merge(movie_rating2, on='movie', how='left')
test6 = test5.merge(genre_rating, on='genre', how='left')

In [None]:
# Summary statistics of the test frame after the movie/genre merges.
test6.describe()

## 영화별 Rating 평균 Null 처리

In [None]:
# Flag movies that have at least 140 rows, and join the flag onto each row.
# NOTE(review): the row counts here are taken from the test frame itself,
# not from train — confirm that applying the 140 threshold to test-set
# counts is intended.
movie_user2 = test6.groupby(['movie'])[['user']].count().reset_index()
movie_user2['count_over140'] = movie_user2['user'].ge(140)
movie_count2 = movie_user2.drop(columns=['user'])
test7 = pd.merge(test6, movie_count2, how='left', on='movie')

In [None]:
# Adjusted movie mean for test rows: keep 'rating_movie' when the movie is
# flagged count_over140, otherwise use the genre mean.
# Vectorized replacement for the former per-row Python loop, which also
# relied on label-based lookups (test7[col][i]) matching row positions.
test7['rating_movie_adj'] = np.where(
    test7['count_over140'],
    test7['rating_movie'],
    test7['rating_genre'],
)

In [None]:
# Summary statistics after computing rating_movie_adj (before null filling).
test7.describe()

In [None]:
# Rows whose adjusted movie mean is still missing fall back to the genre
# mean. Filling NaN directly avoids the former 0 placeholder, which would
# also have overwritten a genuine value of exactly 0, and removes the
# per-row Python loop.
test7['rating_movie_adj'] = test7['rating_movie_adj'].fillna(test7['rating_genre'])

In [None]:
# Summary statistics after filling missing rating_movie_adj values.
test7.describe()

## Test 데이터 중간 정리

In [None]:
# Remove the listed columns from test7 in place, then preview three rows.
test7.drop(
    columns=['id', 'count_over140', 'rating_movie', 'title',
             'sub1', 'sub2', 'sub3', 'sub4', 'sub5'],
    inplace=True,
)
test7.head(3)

## Zip을 지역으로 변환

In [None]:
# Region = first character of the zip code rendered as text; the zip column
# itself is discarded. The region means come from the training data.
test8 = test7.drop(columns=['zip'])
test8['region'] = test7['zip'].astype(str).str[:1]
test9 = pd.merge(test8, region_rating, how='left', on='region')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame after the region merge.
test9.head()

## 유저 별 평균 & Null 처리

In [None]:
# Attach the per-user mean rating from train; users without a match in
# user_rating end up with NaN in 'rating_user'.
test = pd.merge(test9, user_rating, how='left', on='user')
test.describe()

In [None]:
# Fallback 1 for missing 'rating_user': the mean train rating of rows that
# share the same sex, age, job and region.
user_rating2 = (
    train.groupby(['sex', 'age', 'job', 'region'])['rating']
    .mean()
    .rename('rating_user2')
    .reset_index()
)
test_u1 = test.merge(user_rating2, on=['sex', 'age', 'job', 'region'], how='left')
# Fill only the missing entries. The former fillna(0)-then-loop approach
# treated any genuine 0 as missing and iterated row by row in Python.
test_u1['rating_user'] = test_u1['rating_user'].fillna(test_u1['rating_user2'])
test_u1.drop(['rating_user2'], axis=1, inplace=True)

In [None]:
# Summary statistics after the first rating_user fallback.
test_u1.describe()

In [None]:
# Fallback 2 for 'rating_user' values still missing: the mean train rating
# of rows that share the same sex, age and region.
user_rating3 = (
    train.groupby(['sex', 'age', 'region'])['rating']
    .mean()
    .rename('rating_user3')
    .reset_index()
)
test_u2 = test_u1.merge(user_rating3, on=['sex', 'age', 'region'], how='left')
# Fill only the missing entries. The former fillna(0)-then-loop approach
# treated any genuine 0 as missing and iterated row by row in Python.
test_u2['rating_user'] = test_u2['rating_user'].fillna(test_u2['rating_user3'])
test_u2.drop(['rating_user3'], axis=1, inplace=True)

In [None]:
# Summary statistics after the second rating_user fallback.
test_u2.describe()

In [None]:
# Fallback 3 for 'rating_user' values still missing: the mean train rating
# of rows that share the same sex and age.
user_rating4 = (
    train.groupby(['sex', 'age'])['rating']
    .mean()
    .rename('rating_user4')
    .reset_index()
)
test_u3 = test_u2.merge(user_rating4, on=['sex', 'age'], how='left')
# Fill only the missing entries. The former fillna(0)-then-loop approach
# treated any genuine 0 as missing and iterated row by row in Python.
test_u3['rating_user'] = test_u3['rating_user'].fillna(test_u3['rating_user4'])
# `test` is rebound to the fully filled frame without the helper column.
test = test_u3.drop(['rating_user4'], axis=1)

In [None]:
# Preview the first rows of the test frame after all rating_user fallbacks.
test.head()

In [None]:
# Preview the first rows of the training frame.
train.head()

### Rating 변수들 표준화

In [None]:
# genre=genre_rating
# genre['rating_genre_mean']=genre['rating_genre'].mean()
# genre['rating_genre_std']=genre['rating_genre'].std()
# std4=[]
# std4=(genre['rating_genre']-genre['rating_genre_mean'])/genre['rating_genre_std']
# genre['rating_genre']=std4
# genre_rating=genre.drop(['rating_genre_mean','rating_genre_std'],axis=1)

In [None]:
# user2=user_rating2
# user2['rating_user2_mean']=user2['rating_user2'].mean()
# user2['rating_user2_std']=user2['rating_user2'].std()
# std5=[]
# std5=(user2['rating_user2']-user2['rating_user2_mean'])/user2['rating_user2_std']
# user2['rating_user2']=std5
# user_rating2=user2.drop(['rating_user2_mean','rating_user2_std'],axis=1)

In [None]:
# user3=user_rating3
# user3['rating_user3_mean']=user3['rating_user3'].mean()
# user3['rating_user3_std']=user3['rating_user3'].std()
# std6=[]
# std6=(user3['rating_user3']-user3['rating_user3_mean'])/user3['rating_user3_std']
# user3['rating_user3']=std6
# user_rating3=user3.drop(['rating_user3_mean','rating_user3_std'],axis=1)

In [None]:
# user4=user_rating4
# user4['rating_user4_mean']=user4['rating_user4'].mean()
# user4['rating_user4_std']=user4['rating_user4'].std()
# std7=[]
# std7=(user4['rating_user4']-user4['rating_user4_mean'])/user4['rating_user4_std']
# user4['rating_user4']=std7
# user_rating4=user4.drop(['rating_user4_mean','rating_user4_std'],axis=1)

In [None]:
# job=job_rating
# job['rating_job_mean']=job['rating_job'].mean()
# job['rating_job_std']=job['rating_job'].std()
# std8=[]
# std8=(job['rating_job']-job['rating_job_mean'])/job['rating_job_std']
# job['rating_job']=std8
# job_rating=job.drop(['rating_job_mean','rating_job_std'],axis=1)

In [None]:
# age=age_rating
# age['rating_age_mean']=age['rating_age'].mean()
# age['rating_age_std']=age['rating_age'].std()
# std9=[]
# std9=(age['rating_age']-age['rating_age_mean'])/age['rating_age_std']
# age['rating_age']=std9
# age_rating=age.drop(['rating_age_mean','rating_age_std'],axis=1)

# 3. 분석

## Dummy화 함수

In [None]:
def process_sex():
    """Add a ``sex(encoded)`` column to the global ``train`` frame.

    The shared module-level ``encoder`` is fitted on ``train['sex']`` and the
    resulting integer codes are stored in place on ``train``.
    """
    global train
    sex_labels = train['sex']
    train['sex(encoded)'] = encoder.fit(sex_labels).transform(sex_labels)

def process_job():
    """Append ``job_*`` indicator columns to the global ``train`` frame.

    ``train`` is rebound to a new frame holding the original columns followed
    by one indicator column per distinct ``job`` value.
    """
    global train
    job_dummies = pd.get_dummies(train['job'], prefix='job')
    train = pd.concat((train, job_dummies), axis=1)

def process_genre():
    """Append ``genre_*`` indicator columns to the global ``train`` frame.

    ``train`` is rebound to a new frame holding the original columns followed
    by one indicator column per distinct ``genre`` value.
    """
    global train
    genre_dummies = pd.get_dummies(train['genre'], prefix='genre')
    train = pd.concat((train, genre_dummies), axis=1)

def process_region():
    """Append ``region_*`` indicator columns to the global ``train`` frame.

    ``train`` is rebound to a new frame holding the original columns followed
    by one indicator column per distinct ``region`` value.
    """
    global train
    region_dummies = pd.get_dummies(train['region'], prefix='region')
    train = pd.concat((train, region_dummies), axis=1)
    
def process_age():
    """Append ``age_*`` indicator columns to the global ``train`` frame.

    ``train`` is rebound to a new frame holding the original columns followed
    by one indicator column per distinct ``age`` value.
    """
    global train
    age_dummies = pd.get_dummies(train['age'], prefix='age')
    train = pd.concat((train, age_dummies), axis=1)

In [None]:
def process_sex1():
    """Add a ``sex(encoded)`` column to the global ``test`` frame.

    The shared ``encoder`` is fitted on ``train['sex']`` and only applied to
    ``test['sex']``, so test rows receive the same integer codes as training
    rows. Refitting on ``test`` alone would assign different codes whenever
    the two frames do not contain the same set of labels.

    Raises:
        ValueError: if ``test['sex']`` contains a label absent from
            ``train['sex']`` (raised by the encoder's ``transform``).
    """
    global test
    test['sex(encoded)'] = encoder.fit(train['sex']).transform(test['sex'])
    
def process_job1():
    """Append ``job_*`` indicator columns to the global ``test`` frame.

    ``test`` is rebound to a new frame holding the original columns followed
    by one indicator column per distinct ``job`` value found in ``test``.
    """
    global test
    job_dummies = pd.get_dummies(test['job'], prefix='job')
    test = pd.concat((test, job_dummies), axis=1)

def process_genre1():
    """Append ``genre_*`` indicator columns to the global ``test`` frame.

    ``test`` is rebound to a new frame holding the original columns followed
    by one indicator column per distinct ``genre`` value found in ``test``.
    """
    global test
    genre_dummies = pd.get_dummies(test['genre'], prefix='genre')
    test = pd.concat((test, genre_dummies), axis=1)
    
def process_region1():
    """Append ``region_*`` indicator columns to the global ``test`` frame.

    ``test`` is rebound to a new frame holding the original columns followed
    by one indicator column per distinct ``region`` value found in ``test``.
    """
    global test
    region_dummies = pd.get_dummies(test['region'], prefix='region')
    test = pd.concat((test, region_dummies), axis=1)
    

def process_age1():
    """Append ``age_*`` indicator columns to the global ``test`` frame.

    ``test`` is rebound to a new frame holding the original columns followed
    by one indicator column per distinct ``age`` value found in ``test``.
    """
    global test
    age_dummies = pd.get_dummies(test['age'], prefix='age')
    test = pd.concat((test, age_dummies), axis=1)

## 선형 회귀

In [None]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
#train.to_csv("train_spss.csv", index=False)

In [None]:
# NOTE(review): this binds a second name to the same DataFrame object, not a
# copy, so the column that process_sex() adds in place is also visible through
# train_before — confirm whether a snapshot (train.copy()) was intended.
train_before=train
# Encode sex and append indicator columns, alternating between the train
# helper and its test counterpart for each variable.
process_sex()
process_sex1()
process_job()
process_job1()
process_age()
process_age1()
process_region()
process_region1()
process_genre()
process_genre1()

In [None]:
# List the columns of the training frame after preprocessing.
train.columns

In [None]:
# List the columns of the test frame after preprocessing.
test.columns

In [None]:
# Columns fed to the models: the encoded sex plus the mean-rating features.
feature = [
    'sex(encoded)',
    'rating_age',
    'rating_job',
    'rating_genre',
    'rating_movie_adj',
    'rating_region',
    'rating_user',
]

In [None]:
# Model inputs: the target vector and the train/test design matrices.
# assign() returns a new frame with the extra column, so no value is set on a
# slice of train/test (which triggered SettingWithCopyWarning before).
# NOTE(review): the all-ones 'constant' column is kept for compatibility;
# presumably it is redundant for estimators that fit their own intercept —
# confirm before removing it.
target = train['rating']
train_t = train[feature].assign(constant=1)
test_t = test[feature].assign(constant=1)

In [None]:
# Fit ordinary least squares on the training features and predict the
# rating for every test row.
model = LinearRegression().fit(train_t, target)
result = model.predict(test_t)

In [None]:
# Build the submission: user and id come from the sample file, rating from
# the linear-regression predictions (assigned by row position).
raw = pd.read_csv('sample_submission.csv')
output = pd.DataFrame(
    {'user': raw['user'], 'rating': result, 'id': raw['id']},
    columns=['user', 'rating', 'id'],
)
output.to_csv('output64.csv', index=False)

In [None]:
# Score: 4.31319

In [None]:
# Columns handed to forward selection: the response 'rating' plus the raw
# categorical columns and the mean-rating features.
feature2 = [
    'rating', 'sex', 'age', 'job', 'genre',
    'rating_age', 'rating_job', 'rating_genre', 'rating_movie_adj',
    'region', 'rating_region', 'rating_user',
]
train_t2 = train[feature2]

## Forward Selection....너무 느림

In [None]:
import statsmodels.formula.api as smf

def forward_selected(data, response):
    """Greedy forward selection of OLS predictors by adjusted R-squared.

    Starting with no predictors, each round fits one OLS model per remaining
    column (added to the already selected ones) and keeps the column giving
    the highest adjusted R-squared. Selection stops as soon as no candidate
    strictly improves the current score, or when no columns remain.

    Args:
        data: DataFrame holding the response and every candidate predictor.
            Column names are pasted verbatim into the formula string, so they
            must be usable as formula terms.
        response: Name of the response column in ``data``.

    Returns:
        The fitted OLS results for ``response`` on the selected predictors
        (intercept-only when no predictor improved on a score of 0.0).
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score = 0.0
    while remaining:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response,
                                           ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        best_new_score, best_candidate = max(scores_with_candidates)
        # Stop on "no strict improvement". The previous loop condition kept
        # iterating forever when the best score exactly equalled the current
        # one, because nothing was removed from `remaining` in that case.
        if not best_new_score > current_score:
            break
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score
    if selected:
        formula = "{} ~ {} + 1".format(response,
                                       ' + '.join(selected))
    else:
        # Nothing was selected: use an explicit intercept-only formula
        # instead of one with an empty right-hand term.
        formula = "{} ~ 1".format(response)
    model = smf.ols(formula, data).fit()
    return model

In [None]:
# Run forward selection with 'rating' as the response column.
model2=forward_selected(train_t2,'rating')

In [None]:
# Print the summary of the selected model.
print(model2.summary())

## 랜덤 포레스트

In [None]:
# Random forest classifier with 100 trees, each limited to depth 10.
# NOTE(review): no random_state is passed — presumably results differ between
# runs; confirm whether reproducibility is needed.
forest=RandomForestClassifier(n_estimators=100,max_depth=10)

In [None]:
# Fit on the same feature matrix and target used for the linear regression.
forest=forest.fit(train_t, target)

In [None]:
# Predict a rating for every row of the test feature matrix.
result2=forest.predict(test_t)

In [None]:
# Build the submission: user and id come from the sample file, rating from
# the random-forest predictions (assigned by row position).
raw2 = pd.read_csv('sample_submission.csv')
output = pd.DataFrame(
    {'user': raw2['user'], 'rating': result2, 'id': raw2['id']},
    columns=['user', 'rating', 'id'],
)
output.to_csv('output51.csv', index=False)

## 로지스틱 회귀

In [None]:
# Logistic regression with the library's default settings.
# NOTE(review): presumably the default iteration limit may not reach
# convergence on this data — check for convergence warnings when fitting.
lr=LogisticRegression()

In [None]:
# Fit on the same feature matrix and target used for the other models.
lr=lr.fit(train_t, target)

In [None]:
# Predict a rating for every row of the test feature matrix.
result3=lr.predict(test_t)

In [None]:
# Build the submission: user and id come from the sample file, rating from
# the logistic-regression predictions (assigned by row position).
raw3 = pd.read_csv('sample_submission.csv')
output = pd.DataFrame(
    {'user': raw3['user'], 'rating': result3, 'id': raw3['id']},
    columns=['user', 'rating', 'id'],
)
output.to_csv('output57.csv', index=False)