### 1. Data Preprocessing 
#### ① Basic preprocessing

도서관에서 제공받은 데이터를 예측에 사용하기 위해 Data Preprocessing(데이터 전처리)을 진행했으며, 본 페이지에서는 그중 가장 먼저 이루어지는 기본 전처리 단계를 다룬다. **base_preprocess**를 통해 열 이름을 재정의하고, 예측에 필요하지 않은 데이터를 걸러냈다. 그 후 **make_student_table**과 **make_book_table**에서, 이후 전처리 단계에 사용할 학생 테이블과 도서 정보 테이블을 각각 반환했다.

*열 이름 재정의, school_dict의 세부 내용은 데이터 보호를 위해 생략한다.*

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [None]:
def base_preprocess(data, start_date, end_date, record_cnt):
    """Clean the raw loan log and keep only the rows/columns used for prediction.

    The input frame is not modified; all work happens on a renamed copy.

    Args:
        data: Raw loan DataFrame. After the column rename it must contain the
            columns selected in step 2 below.
        start_date: Inclusive lower bound compared against ``borrow_date``.
        end_date: Exclusive upper bound compared against ``return_date``.
        record_cnt: Minimum number of loan records a student needs in order
            to be kept.

    Returns:
        A DataFrame with columns ``book_id``, ``student_id``, ``title``,
        ``book_ddc`` and ``major`` and a fresh 0..n-1 index.
    """
    # 1. rename columns & rescale the student id.
    # rename() without inplace returns a new frame, so the caller's DataFrame
    # is left untouched and the function can safely be called again on it.
    data = data.rename(columns={'original_column_names': 'new_column_names'})

    # int64 is requested explicitly: ids scaled by 1e10 can exceed the 32-bit
    # range, and the plain 'int' alias is platform dependent.
    scaled_id = data['student_id'] * 10000000000
    data['student_id'] = scaled_id.astype('int64').astype('str')

    # 2. keep only the columns needed by the filters below
    data = data[['book_id', 'student_id', 'book_loc', 'book_loc_2', 'title',
                 'book_ddc', 'borrow_date', 'return_date',
                 'borrow_state', 'borrow_type', 'return_type', 'seoji_num', 'major']]

    # 3-8. row filters, combined into a single boolean mask
    keep = (
        # 3. return_type: drop books returned through a loss report
        (data['return_type'] != '분실신고반납')
        # 4. borrow_state: keep only completed returns
        & (data['borrow_state'] == '반납완료')
        # 5. borrow_type: keep only regular, reserved and self-service loans
        & data['borrow_type'].isin(['일반대출', '예약도서대출', '무인대출반납'])
        # 6. major: drop library / staff / external-user pseudo majors
        & ~data['major'].isin(['중앙도서관', '학술정보지원팀', '외부이용자'])
        # 7. book: drop theses, workbooks and DVDs by book location
        & ~data['book_loc_2'].isin(['book_locations'])
        # 8. book: drop books without a DDC call number
        & data['book_ddc'].notnull()
    )
    data = data[keep]

    # 9. student: keep students with at least `record_cnt` loan records.
    # The count is taken before the date filter in step 10, so records
    # outside the requested period still count towards the threshold.
    borrow_counts = data.groupby('student_id')['borrow_date'].count()
    frequent_students = borrow_counts[borrow_counts >= record_cnt].index
    filtered_data = data[data['student_id'].isin(frequent_students)]

    # 10. filter by borrow and return date
    in_period = ((filtered_data['borrow_date'] >= start_date)
                 & (filtered_data['return_date'] < end_date))
    filtered_data = filtered_data[in_period]

    # 11. keep only the columns used for prediction
    filtered_data = filtered_data[['book_id', 'student_id', 'title', 'book_ddc', 'major']]

    return filtered_data.reset_index(drop=True)

def make_student_table(data):
    """Build a one-row-per-student table with the student's college attached.

    Args:
        data: DataFrame containing at least ``student_id`` and ``major``.

    Returns:
        A DataFrame sorted by ``student_id`` with columns ``student_id``,
        ``major``, ``college`` (the matching key of ``college_dict``) and
        ``college_id`` (position of that key among the sorted keys).
        Students whose major matches no college are dropped.
    """
    student_major = (
        data[['student_id', 'major']]
        .drop_duplicates(['student_id'], keep='first')
        .sort_values('student_id')
        .reset_index(drop=True)
    )

    ## generalizing level of major as a college unit
    college_dict = {'school' : 'major_list'}

    # Sorted once up front so every college gets a stable integer id.
    college_rank = {name: idx for idx, name in enumerate(sorted(college_dict))}

    def find_college(major):
        """Return the college whose major collection contains `major`, else None."""
        if pd.isna(major):
            return None
        for college, majors in college_dict.items():
            if major in majors:
                return college
        return None

    college = student_major['major'].map(find_college)
    student = student_major.assign(
        college=college,
        college_id=college.map(college_rank),
    )

    # Drop students whose major could not be mapped to any college.
    student = student[student['college'].notnull()].reset_index(drop=True)
    # With the unmapped rows gone the ids are all present and can be integers.
    student['college_id'] = student['college_id'].astype('int64')

    return student


def make_book_table(data):
    """Return one row per ``book_id`` with its title and DDC call number.

    When a book appears several times, the first occurrence in `data` is the
    one kept. The result is ordered by ``book_id`` and re-indexed from 0.
    """
    book_columns = ['book_id', 'title', 'book_ddc']
    unique_books = data[book_columns].drop_duplicates(subset=['book_id'], keep='first')
    ordered_books = unique_books.sort_values(by='book_id')
    return ordered_books.reset_index(drop=True)


In [None]:
if __name__ == '__main__':
    # Load the raw loan log and run the basic cleaning for 2019 loans,
    # keeping students with at least 5 loan records.
    raw_data = pd.read_csv('../tot_1519.csv')
    data = base_preprocess(raw_data, '2019-01-01', '2020-01-01', record_cnt=5)

    # Student table first: the loan data is then restricted to the students
    # that appear in it before the book table is derived.
    student_df = make_student_table(data)
    data = data[data['student_id'].isin(student_df['student_id'])]
    book_df = make_book_table(data)

    ## save df (uncomment to write the tables to disk)
    #student_df.to_csv('student.csv')
    #book_df.to_csv('book.csv')
    #data.to_csv('data.csv')