# 以創意擇優對新聞進行可信度評價

# 基本假設
1. 可信度定義：以 1 ~ 5 的分數表示（最低 1 分，最高 5 分）。
2. 每個使用者的評分都是理性的
3. 所有新聞被所有使用者評分
4. 所有評分被所有使用者評價

TODO：另寫一篇文章（小論文格式），說明本筆記的假設與計算方法，並放在 README。

### 變數定義
變數名稱|中文|說明
:----|:----:|----
reporter|報導者|
news|新聞|
user|使用者|
review|評分|對新聞打分數
reviewer|評分者|對新聞打分數的使用者
judge|評價|對評分打分數
judger|評價者|對評分打分數的使用者
score|分數|
weight|權重|即為可信度

In [None]:
# Initialisation: shared imports and the sizes used to generate the simulated data.
import pandas as pd
import numpy as np

# Counts read by the data generators and by the user-weight holder.
num = {
    'reporter' : 5,
    'news_per_reporter' : 30,
    'user' : 13,
    'user_weight_window' : 10 # maximum number of past entries used when computing a weight
}

# Range of scores a rating can take.
class Scale:
    """Score range with inclusive bounds ``min``/``max`` and their midpoint ``mean``."""

    def __init__(self, min = 1, max = 5):
        # Both bounds are kept as given; the midpoint is derived once here.
        self.min, self.max = min, max
        self.mean = (min + max) / 2

    def arange(self):
        """Return the scores from ``min`` up to and including ``max`` in steps of 1."""
        lower = self.min
        upper_exclusive = self.max + 1
        return np.arange(lower, upper_exclusive)

# Module-wide scale instance built with the default bounds (1 to 5).
scale = Scale()

def index_as_id(data_frame, suffix):
    """Return a copy of ``data_frame`` with its index exposed as the first column.

    Parameters:
        data_frame: the pandas DataFrame whose index should become an id column.
        suffix: name stem of the id column; the column is called ``suffix + '_id'``.

    Returns:
        A new DataFrame whose first column is ``suffix + '_id'`` (holding the
        index values) followed by the original columns in their original
        order. The input frame is left untouched.
    """
    id_column = suffix + '_id'

    # Work on a copy so the caller's frame is never mutated.
    result = data_frame.copy()

    # If the id column is already present (e.g. the function is applied twice),
    # drop the stale one first so the fresh id column is the one placed in
    # front rather than some unrelated trailing column.
    if id_column in result.columns:
        result = result.drop(columns = [id_column])

    result.insert(0, id_column, result.index)

    return result

### 產生測試資料

In [None]:
# Users
def get_users(num):
    """Build the user table: one row per user with a sequential ``user_id``.

    ``num['user']`` is the number of users to create; ids run from 0 upwards.
    """
    user_ids = np.arange(num['user'])
    return pd.DataFrame({'user_id' : user_ids})

# Build the user table from the configured counts and preview its first rows.
users = get_users(num)
users.head()

In [None]:
# Keeps track of every user's weight.
class UserWeightHolder:
    """Track a current weight and a fixed-length weight history per user.

    Every user starts with ``init_mean`` as current weight and with a history
    made of ``user_weight_window`` copies of ``init_mean``. ``inserts`` pushes
    a new value into a user's history, discards the oldest one and recomputes
    the current weight as the simple mean of the history.

    All state lives on the instance (``past_weights`` and ``weights`` are
    created by ``reset``); nothing mutable is stored on the class.
    """

    def __init__(self, users, init_mean, user_weight_window = 10):
        """Create the holder and initialise every user's weight.

        Parameters:
            users: object whose ``['user_id']`` item iterates over the user ids.
            init_mean: starting weight for every user.
            user_weight_window: number of past weights kept per user.

        Raises:
            ValueError: if ``user_weight_window`` is smaller than 1, because an
                empty history can neither be slid nor averaged.
        """
        if user_weight_window < 1:
            raise ValueError('user_weight_window must be at least 1')

        self.users = users
        self.user_weight_window = user_weight_window
        self.init_mean = init_mean

        self.reset()

    def reset(self):
        """Set every user's history and current weight back to ``init_mean``."""
        user_ids = list(self.users['user_id'])
        self.past_weights = {user_id: [self.init_mean] * self.user_weight_window for user_id in user_ids}
        self.weights = {user_id: self.init_mean for user_id in user_ids}

    def get(self, user_id):
        """Return the current weight of ``user_id`` (KeyError if unknown)."""
        return self.weights[user_id]

    def inserts(self, new_weights):
        """Push one new weight per user and refresh the current weights.

        ``new_weights`` maps user id -> new weight. Users that are not in the
        mapping keep their history and weight; an id that was not present at
        construction/reset time raises KeyError.
        """
        for user_id, new_weight in new_weights.items():
            history = self.past_weights[user_id]

            # Slide the window: forget the oldest entry, remember the newest.
            del history[0]
            history.append(new_weight)

            # TODO: choose between a simple and a weighted average; doing the
            # right thing should lead to a higher weight.
            self.weights[user_id] = sum(history) / self.user_weight_window

    def print(self):
        """Print the current weights and the full histories."""
        print('weights:', self.weights)
        print('past_weights:', self.past_weights)

# Create the weight holder: every user starts at the scale midpoint, with the configured history window.
user_weight_holder = UserWeightHolder(users, scale.mean, num['user_weight_window'])
user_weight_holder.print()

In [None]:
class WeightCalculator:
    """Piecewise-linear mapping described by sample points ``xp`` and their values ``fp``."""

    def __init__(self, xp, fp):
        self.xp, self.fp = xp, fp

    def cal(self, x):
        """Return ``x`` mapped through the interpolation table via ``np.interp``."""
        interpolated = np.interp(x, self.xp, self.fp)
        return interpolated
    
# Linear map from [scale.min**2, scale.max**2] onto [scale.min, scale.max].
weight_calculator = WeightCalculator([scale.min**2, scale.max**2], [scale.min, scale.max])

In [None]:
# Reporters
def get_reporters(num):
    """Build the reporter table with one random ``bvlty`` value per reporter.

    ``num['reporter']`` is the number of reporters. ``bvlty`` is drawn from a
    standard normal distribution (presumably short for "believability" —
    confirm). The row index is exposed as ``reporter_id`` via ``index_as_id``.
    """
    bvlty_values = np.random.randn(num['reporter'])
    reporter_frame = pd.DataFrame({'bvlty': bvlty_values})
    return index_as_id(reporter_frame, 'reporter')

# Build the reporter table and preview its first rows.
reporters = get_reporters(num)

reporters.head()

In [None]:
# News
# (Marked with an X in the original note, apparently abandoned:) news credibility
# = credibility derived from the reporter's credibility + media preference.
# Every news item has a true score.

def get_news(num, scale, reporters):
    """Build the news table: ``news_id``, ``reporter_id`` and ``news_real_score``.

    Parameters:
        num: config dict; ``num['news_per_reporter']`` is read here.
        scale: object whose ``arange()`` returns the possible scores.
        reporters: reporter table with a ``reporter_id`` column.

    Returns:
        One row per news item; the row index is exposed as ``news_id``.
    """
    # One reporter id per news item: the whole id list repeated
    # news_per_reporter times.
    reporter_ids = list(reporters['reporter_id']) * num['news_per_reporter']

    # Unused sketch kept for reference — look up the credibility of each news
    # item's reporter:
    # authors_bvlty = [reporters.iloc[author_index].bvlty for author_index in authors]

    # ...and define the news credibility as N(author.bvlty, 1), i.e. a standard
    # normal draw plus the reporter's credibility:
    # news_bvlty = np.random.randn(num_news) + authors_bvlty

    # Assume every news item has a true score, picked at random from the scale.
    # TODO: could this be linked to the reporter's quality?
    true_scores = np.random.choice(scale.arange(), len(reporter_ids))

    # TODO: visualise whether highly credible reporters end up with highly credible news.

    news_frame = pd.DataFrame({
        'reporter_id' : reporter_ids,
        'news_real_score' : true_scores
    })

    return index_as_id(news_frame, 'news')

# Build the news table and preview its first rows.
news = get_news(num, scale, reporters)

news.head()

In [None]:
# Users' review scores for the news
def get_reviews(num, scale, reviewers, news):
    """Build the review table: every reviewer scores every news item.

    Parameters:
        num: not read by this function.
        scale: object whose ``arange()`` returns the possible scores.
        reviewers: user table with a ``user_id`` column.
        news: news table with ``news_id`` and ``news_real_score`` columns.

    Returns:
        One row per (reviewer, news item) pair with a random ``review_score``;
        the row index is exposed as ``review_id``.
    """
    # One row per (reviewer, news item) pair, reviewer-major order.
    rows = []
    for reviewer_id in reviewers['user_id']:
        for item in news.itertuples(index = False):
            rows.append([reviewer_id, item.news_id, item.news_real_score])
    review_frame = pd.DataFrame(rows, columns = ['reviewer_id', 'news_id', 'news_real_score'])

    # The score each reviewer gives each news item is picked at random from the scale.
    review_frame['review_score'] = np.random.choice(scale.arange(), len(rows))

    return index_as_id(review_frame, 'review')

# Build the review table (every user reviews every news item) and preview its first rows.
reviews = get_reviews(num, scale, users, news)

reviews.head()

In [None]:
# Users' judgements of the "reviews"
def get_judges(num, scale, reviewers, reviews, news):
    """Build the judgement table: every user judges every review.

    Parameters:
        num: not read by this function.
        scale: object whose ``max`` attribute is the highest possible score.
        reviewers: user table with a ``user_id`` column (the judgers).
        reviews: review table with ``review_id``, ``review_score`` and
            ``news_real_score`` columns.
        news: not read by this function.

    Returns:
        One row per (review, judger) pair, review-major order, with the
        computed ``judge_score``.
    """
    def score_review(review_score, real_score):
        # The closer a review is to the true score, the higher its judgement
        # (all users are assumed to be rational).
        return scale.max - abs(review_score - real_score)

    # One row per (review, judger) pair.
    rows = []
    for review in reviews.itertuples(index = False):
        for judger_id in reviewers['user_id']:
            rows.append([review.review_id, review.review_score, review.news_real_score, judger_id])
    judge_frame = pd.DataFrame(rows, columns = ['review_id', 'review_score', 'news_real_score', 'judger_id'])

    # Judgement score of every row.
    judge_frame['judge_score'] = [
        score_review(review_score, real_score)
        for review_score, real_score in zip(judge_frame['review_score'], judge_frame['news_real_score'])
    ]

    return judge_frame

# Build the judgement table (every user judges every review) and preview its first rows.
judges = get_judges(num, scale, users, reviews, news)

judges.head()

In [None]:
# Merge everything into one dataset
def cols_to_use(right, left):
    """Return the columns of ``left`` that ``right`` does not already have."""
    return left.columns.difference(right.columns)

# Attach the review columns that judges lacks; rows are matched from
# judges.review_id to the index of reviews.
data = judges.merge(
    reviews[cols_to_use(judges, reviews)],
    left_on = 'review_id',
    right_index = True
)

# Attach the news columns still missing; rows are matched from data.news_id
# to the index of news.
data = data.merge(
    news[cols_to_use(data, news)],
    left_on = 'news_id',
    right_index = True
)

# Current weight of the judger behind every row.
data['judger_weight'] = list(map(user_weight_holder.get, data['judger_id']))

# Exclude the rows where a user judges their own review
data = data[data['judger_id'] != data['reviewer_id']]

data.head()

## 開始計算可信度 / 權重

In [None]:
# Compute every user's credibility / weight.

# Goal: find out what a judgement is worth once weighted. Each judge score is
# multiplied by a weight; the weight acts as a multiplier earned by how often
# the judger did the right thing before.

# A user's weight = weighted average of the judgements OTHER users gave to this
# user's reviews, weighted by those other users' weights:
# weight(user) = sum(score(judge_u1)*weight(user1) + ... + score(judge_uN)*weight(userN)) / sum(weights)

# Weighted average of the judge scores in one group, weighted by the judgers' weights.
def judger_weight_averaging(x):
    return np.average(x['judge_score'], weights = x['judger_weight'])

# Group by reviewer_id — the user who RECEIVED the judgements — as the formula
# above requires. Grouping by judger_id would average the scores a user handed
# out, and since judger_weight is constant inside such a group the weighting
# would have no effect.
# Only the two needed columns are passed to apply, so the grouping column is
# not part of the frame given to the function.
judger_weights = data.groupby('reviewer_id')[['judge_score', 'judger_weight']].apply(judger_weight_averaging)

# Push the new weights into the holder (user id -> new weight).
user_weight_holder.inserts(judger_weights.to_dict())
user_weight_holder.print()

In [None]:
# Compute every news item's credibility / weight.

# The credibility of a news item is the average of the users' review scores for
# it, weighted by each reviewer's credibility weight:
# weight(news) = sum[score(review_u1)*weight(user1) + ... + score(review_uN)*weight(userN)] / sum(weight(users))

# Current weight of the reviewer behind every row (same index as data).
reviewer_weight = data['reviewer_id'].map(user_weight_holder.get)

# Weighted score of each review = reviewer's weight * reviewer's score.
data['review_score_weighted'] = reviewer_weight * data['review_score']

# data holds one row per (review, judger) pair, so each review is repeated.
# Keep the first row of every review so that each review counts exactly once.
first_row_of_review = ~data['review_id'].duplicated()
news_of_review = data.loc[first_row_of_review, 'news_id']

# Numerator and denominator of the weighted average, per news item.
weighted_score_sum = data.loc[first_row_of_review, 'review_score_weighted'].groupby(news_of_review).sum()
weight_sum = reviewer_weight[first_row_of_review].groupby(news_of_review).sum()

# Dividing by the sum of weights keeps the result on the same scale as the
# review scores, so no rescaling through weight_calculator is needed.
news_weights = weighted_score_sum / weight_sum

# Add the news credibility to data; the cells below read this column.
# Mapping by news_id (rather than merging) keeps this cell safe to re-run.
data['news_weight'] = data['news_id'].map(news_weights)

In [None]:
# Summary statistics of the news_weight column per news item (data must already contain that column).
data.groupby('news_id')['news_weight'].describe()

In [None]:
# Compute every reporter's credibility / weight.

# A reporter's credibility is the simple mean of the credibility of the news
# they wrote (taken here over the rows of data):
# weight(reporter) = avg[weight(news1) + weight(news2) + .... + weight(newsN)]
reporter_weight = (
    data
    .groupby('reporter_id', as_index = False)['news_weight']
    .mean()
    .rename(columns = {'news_weight' : 'reporter_weight'})
)
reporter_weight.head(20)