這是一個用來推薦中餐要吃哪間餐廳的模型, 使用的資料包含用餐者有哪些人, 前六天的用餐紀錄, 天氣等資料.

config 裡面定義用餐人選與可能用餐的餐廳, 若 data 裡面的人名與餐廳不在 config 設定的名單裡面會丟出 exception

# Configure

In [0]:
import numpy as np
import pandas as pd

g_config = {
    "members": [
        'Jack',
        'Kelly',
        'Mark',
        'Jason',
        'Alin',
        'Tom',
        'Tony',
        'Sophie'
    ],
    'restaurants': [
        'Tokyo Susi',
        'Haka Restaurant',
        'Beef noodle',
        'Pizza',
        'Li Chinese Food',
        'Holy Coffee'
    ]
}

# weather: 1晴天, 2很熱, 3很冷, 4快下雨, 5雨天
g_real_data = [
    [['Jack', 'Alin', 'Tom', 'Tony', 'Sophie'], 'Pizza', 1],
    [['Jack', 'Kelly', 'Mark', 'Alin', 'Tom', 'Tony', 'Sophie'], 'Beef noodle', 1]
]

g_training_data = [
    [['Jack', 'Alin', 'Tom', 'Tony', 'Sophie'], 'Pizza', 1],
    [['Alin', 'Tom', 'Kelly'], 'Haka Restaurant', 1],
    [['Jack', 'Jason', 'Tom', 'Tony', 'Sophie'], 'Tokyo Susi', 1],
    [['Kelly', 'Tony', 'Tom', 'Kelly'], 'Pizza', 1],

    [['Jack', 'Kelly', 'Mark', 'Alin', 'Tom', 'Tony'], 'Beef noodle', 1],
    [['Jack', 'Kelly', 'Jason', 'Alin', 'Tom', 'Tony', 'Sophie'], 'Beef noodle', 1],

    [['Kelly', 'Tom'], 'Li Chinese Food', 1],
    [['Tom', 'Tony'], 'Li Chinese Food', 2],
    [['Kelly', 'Tom'], 'Li Chinese Food', 1],
    [['Tony', 'Jack'], 'Li Chinese Food', 4],
    [['Kelly', 'Tom'], 'Li Chinese Food', 1],

    [['Kelly', 'Mark', 'Alin', 'Tom', 'Tony'], 'Holy Coffee', 3],
    [['Jack', 'Alin', 'Tom', 'Tony', 'Sophie'], 'Holy Coffee', 3],
    [['Alin', 'Tom', 'Tony', 'Sophie'], 'Holy Coffee', 3],
    [['Jack', 'Alin', 'Tom', 'Tony', 'Sophie'], 'Holy Coffee', 3],
    [['Alin', 'Tom', 'Tony', 'Sophie'], 'Holy Coffee', 3],
    [['Jack', 'Mark', 'Alin', 'Tom', 'Tony'], 'Holy Coffee', 3]
]

g_testing_data = [
    [['Jack', 'Alin', 'Tony', 'Sophie'], 'Pizza', 1],
    [['Kelly', 'Tom'], 'Li Chinese Food', 4],
    [['Kelly', 'Jack', 'Tom'], 'Li Chinese Food', 1],
    [['Jack', 'Alin', 'Tom', 'Sophie'], 'Pizza', 2],
    [['Kelly', 'Mark', 'Alin', 'Tom', 'Tony', 'Jack'], 'Beef noodle', 1],
    [['Kelly', 'Mark', 'Alin', 'Tom', 'Tony', 'Jack', 'Jason'], 'Beef noodle', 1],
    [['Kelly', 'Alin', 'Tom', 'Tony', 'Jason'], 'Holy Coffee', 3],
    [['Mark', 'Alin', 'Tom', 'Tony'], 'Haka Restaurant', 1]
]


# ETL



In [13]:
from collections import deque
from sklearn.preprocessing import label_binarize
from sklearn import preprocessing

g_members = []
g_restaurants = []
g_le_of_restaurants = preprocessing.LabelEncoder()

def initialize(data):
  members = set()
  restaurants = set()
  
  for record in data:
    if record[1] not in g_config['restaurants']:
      raise Exception('Unknown restaurant %s' % (record[1]))
    
    restaurants.add(record[1])

    for member in record[0]:
      if member not in g_config['members']:
        raise Exception('Unknown member %s' % (member))

      members.add(member)

  global g_members
  global g_restaurants
  g_members = list(members)
  g_restaurants = list(restaurants)

  global g_le_of_restaurants
  g_le_of_restaurants.fit(g_restaurants)
  

def rest_to_index(rest):
  return g_le_of_restaurants.transform([rest])[0]

def index_to_rest(idx):
  return g_le_of_restaurants.inverse_transform([idx])[0]

def generate_member_features(members):
  r = label_binarize(members, classes=g_members)
  return pd.DataFrame([np.max(r, axis=0)], columns=g_members)

def generate_restaurant_features(history):
  all = []
  idx = 0
  for rest in history:
    x = label_binarize([rest], classes=g_restaurants)

    # generate column name
    idx += 1
    cols = []
    for c in g_restaurants:
      cols.append('%s_%d' % (c, idx))
    
    all.append(pd.DataFrame(x, columns=cols))
    
  return pd.concat(all, axis=1)

def process_one_record(record, history):
  (member, restaurant, weather) = record
  
  member_features = generate_member_features(member)
  
  restauratn_features = generate_restaurant_features(history)
  
  others = pd.DataFrame(
      [[weather, len(member), rest_to_index(restaurant)]], 
      columns=['weather', 'count', 'answer']
  )

  return pd.concat([member_features, restauratn_features, others], axis=1)

def load_data(data):
  features = process_one_record(data[0], g_history)
  
  for record in data[1:]:
    f = process_one_record(record, g_history)
    features = features.append(f)
    g_history.popleft()
    g_history.append(record[1])
    
  
  weather_features = pd.get_dummies(features['weather'])
  features = features.drop('weather', axis=1)
  
  count_features = pd.get_dummies(features['count'])
  features = features.drop('count', axis=1)
  
  features = pd.concat([features, weather_features, count_features], axis=1)

  
  return features

initialize(g_training_data)
g_history = deque([0,0,0,0,0,0])
g_training = load_data(g_training_data)
g_testing = load_data(g_testing_data)


  mask |= (ar1 == a)


# Training


In [17]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

g_model = None

def train():
  y = g_training['answer'].values
  X = g_training.drop('answer', 1)

  model = LogisticRegression()
  model = model.fit(X, y)

  ty = g_testing['answer'].values
  tX = g_testing.drop('answer', 1)

  p = model.predict(tX)

  acc = model.score(tX, ty)
  f1 = f1_score(ty, p, average='weighted')
  prec = precision_score(ty, p, average='weighted')
  recall = recall_score(ty, p, average='weighted')
  confm = confusion_matrix(ty, p)

  global g_model
  g_model = model
  
  print ('acc:%f\nf1:%f\nprec:%f\nrecall:%f' % (acc, f1, prec, recall))

train()


acc:0.625000
f1:0.550000
prec:0.531250
recall:0.625000


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [18]:

def predict(records, history):
  ys = []
  fs = None
  for record in records:
    feature = process_one_record(record, history)
    ys.append(feature['answer'])
    feature = feature.drop('answer', 1)
    if fs is None:
      fs = feature
    else:
      fs = fs.append(feature)

  weather_features = pd.get_dummies(fs['weather'])
  fs = fs.drop('weather', axis=1)
  
  count_features = pd.get_dummies(fs['count'])
  fs = fs.drop('count', axis=1)
  
  fs = pd.concat([fs, weather_features, count_features], axis=1)
  
  ps = g_model.predict(fs)
  
  return (ys, ps)
  
def test(data):
  history = [r[1] for r in data[-6:]]
  (ys, ps) = predict(data, history)
  for (y, p, d) in zip(ys, ps, data):
    print ('%d %2d : %2d   (%s : %s) - %s' % (y == p, y, p, index_to_rest(y), index_to_rest(p), str(d)))

test(g_testing_data)
#test(g_training_data)

0  4 :  2   (['Pizza'] : Holy Coffee) - [['Jack', 'Alin', 'Tony', 'Sophie'], 'Pizza', 1]
1  3 :  3   (['Li Chinese Food'] : Li Chinese Food) - [['Kelly', 'Tom'], 'Li Chinese Food', 4]
1  3 :  3   (['Li Chinese Food'] : Li Chinese Food) - [['Kelly', 'Jack', 'Tom'], 'Li Chinese Food', 1]
0  4 :  2   (['Pizza'] : Holy Coffee) - [['Jack', 'Alin', 'Tom', 'Sophie'], 'Pizza', 2]
1  0 :  0   (['Beef noodle'] : Beef noodle) - [['Kelly', 'Mark', 'Alin', 'Tom', 'Tony', 'Jack'], 'Beef noodle', 1]
1  0 :  0   (['Beef noodle'] : Beef noodle) - [['Kelly', 'Mark', 'Alin', 'Tom', 'Tony', 'Jack', 'Jason'], 'Beef noodle', 1]
1  2 :  2   (['Holy Coffee'] : Holy Coffee) - [['Kelly', 'Alin', 'Tom', 'Tony', 'Jason'], 'Holy Coffee', 3]
0  1 :  2   (['Haka Restaurant'] : Holy Coffee) - [['Mark', 'Alin', 'Tom', 'Tony'], 'Haka Restaurant', 1]


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
