In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression

##########데이터 로드

train_df = pd.read_excel('https://github.com/cranberryai/todak_todak_python/blob/master/machine_learning/regression/carprice_E1SUl6b.xlsx?raw=true', sheet_name='train')
test_df = pd.read_excel('https://github.com/cranberryai/todak_todak_python/blob/master/machine_learning/regression/carprice_E1SUl6b.xlsx?raw=true', sheet_name='test')

##########데이터 분석

##########데이터 전처리

x_train_df = train_df.drop(['가격'], axis=1)
x_test_df = test_df.drop(['가격'], axis=1)
y_train_df = train_df['가격']
y_test_df = test_df['가격']

print(x_train_df.head())
'''
     년식  종류    연비   마력    토크   연료  하이브리드   배기량    중량 변속기
0  2015  대형   6.8  159  23.0  LPG      0  2359  1935  수동
1  2012  소형  13.3  108  13.9  가솔린      0  1396  1035  자동
2  2015  중형  14.4  184  41.0   디젤      0  1995  1792  자동
3  2015  대형  10.9  175  46.0   디젤      0  2497  2210  수동
4  2015  대형   6.4  159  23.0  LPG      0  2359  1935  자동
'''
print(x_train_df.columns) #Index(['년식', '종류', '연비', '마력', '토크', '연료', '하이브리드', '배기량', '중량', '변속기'], dtype='object')

transformer = make_column_transformer(
    (OneHotEncoder(), ['종류', '연료', '변속기']),
    remainder='passthrough')
transformer.fit(x_train_df)
x_train = transformer.transform(x_train_df)
x_test = transformer.transform(x_test_df)

y_train = y_train_df.values
y_test = y_test_df.values

##########모델 학습

model = LinearRegression()

model.fit(x_train, y_train)

##########모델 검증

print(model.score(x_test, y_test)) #0.7739730315245023

##########모델 예측

x_test = transformer.transform(pd.DataFrame([
    [2016, '대형', 6.8, 159, 25, 'LPG', 0, 2359, 1935, '수동']
], columns=['년식', '종류', '연비', '마력', '토크', '연료', '하이브리드', '배기량', '중량', '변속기']))
print(x_test)
'''
[[1.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00
  1.000e+00 0.000e+00 2.016e+03 6.800e+00 1.590e+02 2.500e+01 0.000e+00
  2.359e+03 1.935e+03]]
'''

y_predict = model.predict(x_test)
print(y_predict) #[1802.160302088625]
print(y_predict[0]) #1802.160302088625

     년식   종류    연비   마력    토크   연료  하이브리드   배기량    중량 변속기
0  2015  준중형  11.8  172  21.0  가솔린      0  1999  1300  자동
1  2015  준중형  12.3  204  27.0  가솔린      0  1591  1300  자동
2  2015   소형  15.0  100  13.6  가솔린      0  1368  1035  수동
3  2014   소형  14.0  140  17.0  가솔린      0  1591  1090  자동
4  2015   대형   9.6  175  46.0   디젤      0  2497  1990  자동
Index(['년식', '종류', '연비', '마력', '토크', '연료', '하이브리드', '배기량', '중량', '변속기'], dtype='object')
0.7739730315245057
[[1.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00
  1.000e+00 0.000e+00 2.016e+03 6.800e+00 1.590e+02 2.500e+01 0.000e+00
  2.359e+03 1.935e+03]]
[1802.16030209]
1802.1603020886687


In [7]:
import pandas as pd 
import itertools 

df = pd.read_csv("marketbasket.csv", index_col=0)

def support(df, item_lst):
    return (df[list(item_lst)].sum(axis=1)==len(item_lst)).mean()

def make_all_set_over_support(df, support_threshold):
    items = []
    single_items = [col for col in df.columns if support(df, [col]) > support_threshold] # size 1 items
    
    size = 2
    while True:
        new_items = []
        for item_cand in itertools.combinations(single_items, size):
            #print(item_cand, (df[list(item_cand)].sum(axis=1)==size).mean())
            if support(df, list(item_cand)) > support_threshold:
                new_items.append(list(item_cand))
        if len(new_items)==0:
            break
        else:
            items+=new_items
            size+=1
    items += [ [s] for s in single_items]# 이렇게 해줘야 모든 type이 list가 됨
    return items

def make_confidence_lst(df, item_set_over_support, confidence_threshold):
    r_lst = []
    for item1 in item_set_over_support:
        for item2 in item_set_over_support:
            if len(set(item1).intersection(set(item2)))==0:
                conf = support(df, list(set(item1).union(set(item2))))/ support(df, item1)
                if conf > confidence_threshold:
                    r_lst.append((item1, item2, conf))
            else:
                continue
    return sorted(r_lst, key=lambda x: x[2], reverse=True)

def make_lift_lst(df, item_set_over_support, lift_threhsold):
    r_lst = []
    for item1 in item_set_over_support:
        for item2 in item_set_over_support:
            if len(set(item1).intersection(set(item2)))==0:
                lift = support(df, list(set(item1).union(set(item2))))
                lift /= support(df, item1)
                lift /= support(df, item2)
                if lift > lift_threhsold:
                    r_lst.append((item1, item2, lift))
            else:
                continue
    return sorted(r_lst, key=lambda x: x[2], reverse=True)

over_support_lst = make_all_set_over_support(df, 0.07)# 0.05로 하면 두 개짜리도 나옴. 로 하면 3개 짜리도 나옴
print("over support list")
print(over_support_lst)
print("-----------------")
print("over confidence list")
for a, b, conf in  make_confidence_lst(df, over_support_lst, 0.53):
    print("{} => {}: {}".format(a, b, conf))
print("-----------------")
print("over lift list")
for a, b, lift in  make_lift_lst(df, over_support_lst, 5.6):
    print("{} => {}: {}".format(a, b, lift))
import pandas as pd print("-----------------")

over support list
[[' 98pct. Fat Free Hamburger'], [' Onions'], [' Potato Chips'], [' Hot Dogs'], [' 2pct. Milk'], [' Eggs'], [' White Bread'], [' Cola'], [' Toothpaste'], [' Hamburger Buns'], [' Wheat Bread'], [' Sweet Relish'], [' Toilet Paper']]
-----------------
over confidence list
[' Hamburger Buns'] => [' 98pct. Fat Free Hamburger']: 0.6804123711340206
[' Toothpaste'] => [' White Bread']: 0.6018518518518519
[' Toothpaste'] => [' Eggs']: 0.5648148148148148
[' Wheat Bread'] => [' White Bread']: 0.5619047619047619
[' Wheat Bread'] => [' 2pct. Milk']: 0.5523809523809524
[' Sweet Relish'] => [' Hot Dogs']: 0.5517241379310344
[' Toothpaste'] => [' 2pct. Milk']: 0.5462962962962963
[' Onions'] => [' White Bread']: 0.5321100917431193
-----------------
over lift list
[' Hamburger Buns'] => [' 98pct. Fat Free Hamburger']: 7.291663284357497
[' 98pct. Fat Free Hamburger'] => [' Hamburger Buns']: 7.291663284357496
[' Hot Dogs'] => [' Sweet Relish']: 5.9594964422550625
[' Sweet Relish'] => [' 

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet


import warnings; warnings.simplefilter('ignore')


link_small = pd.read_csv('links_small.csv')
link_small = link_small[link_small['tmdbId'].notnull()]['tmdbId'].astype('int') 

md = pd. read_csv('movies_metadata.csv')

md = md.drop([19730, 29503, 35587]) 
md['id'] = md['id'].astype('int') 
smd = md[md['id'].isin(link_small)]
smd.shape

(9099, 24)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])
tfidf_matrix.shape



(9099, 268124)

In [4]:
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]


#데이터 정제 중 

smd = smd.reset_index()
titles = smd['title']
indces = pd.Series(smd.index, index=titles)


#smd에 인덱스를 포함하고 타이틀을 만든다. pd.Series를 통해서 타이틀을 인덱스로 하고 indces를 만듭니다. 

def getrecommandations(title):
    index = indces[title]
    sim_scores = list(enumerate(cosine_sim[index]))
    sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores] 
    return titles.iloc[movie_indices]


list(enumerate(cosine_sim[0]))
getrecommandations('The Dark Knight').head(10)



7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
Name: title, dtype: object

In [7]:
from math import sqrt 

critics={
     'hhd':{'guardians of the galaxy 2':5,'christmas in august':4,'boss baby':1.5},
     'chs':{'christmas in august':5,'boss baby':2},
     'kmh':{'guardians of the galaxy 2':2.5,'christmas in august':2,'boss baby':1},
     'leb':{'guardians of the galaxy 2':3.5,'christmas in august':4,'boss baby':5}
}

def sim(i,j):
    return sqrt(pow(i,2)+pow(j,2))

for i in critics:
    if i!='chs':
        num1 = critics.get('chs').get('christmas in august')- critics.get(i).get('christmas in august')
        num2 = critics.get('chs').get('boss baby')- critics.get(i).get('boss baby')
        print(i," : ", 1/(1+sim(num1,num2))) #정규화

hhd  :  0.4721359549995794
kmh  :  0.2402530733520421
leb  :  0.2402530733520421


In [8]:
def sim_distance(data, name1, name2):
    sum=0
    for i in data[name1]:
        if i in data[name2]: #같은 영화를 봤다면
            sum+=pow(data[name1][i]- data[name2][i],2)
        
    return 1/(1+sqrt(sum))


def top_match(data, name, index=3, sim_function=sim_distance):
    li=[]
    for i in data:
        if name!=i: #자기 자신은 제외한다
            li.append((sim_function(data,name,i),i)) # 유사도, 이름을 튜플에 묶어 리스트에 추가한다
    li.sort() #오름차순 정렬
    li.reverse() #내림차순 정렬
    
    return li[:index]

top_match(critics, 'chs')
# 하나의 함수로 여러개의 아이템을 한 번에 비교했고, 
# chs과 hhd의 거리가 가장 가까운 것을 확인할 수 있다. 

[(0.4721359549995794, 'hhd'),
 (0.2402530733520421, 'leb'),
 (0.2402530733520421, 'kmh')]