# 以創意擇優對新聞進行可信度評價

# 基本假設
1. 可信度定義：以分數表示，範圍為 1 ~ 5（注意：下方 `Scale` 類別目前的預設範圍為 1 ~ 10，兩者需統一）。
2. 每個使用者的評分都是理性的
3. 所有新聞被所有使用者評分
4. 每則評分都會被評分者本人以外的所有使用者評價（程式中會排除自己評自己的情況）

TODO：另寫一篇文章講解這裡的內容，採用小論文的格式，並放在 README。

### 變數定義
變數名稱|中文|說明
:----|:----:|----
reporter|報導者|
news|新聞|
reader|讀者|
review|評分|對新聞打分數
reviewer|評分者|對新聞打分數的讀者
judge|評價|對評分打分數
judger|評價者|對評分打分數的讀者
score|分數|
weight|權重|即為可信度

### 產生測試資料

#### 引入套件及資料筆數

In [None]:
'''
初始化
'''
import pandas as pd
import numpy as np

# Sizes that drive the synthetic data set generated in this notebook.
num = dict(
    # number of reporters
    reporter=5,
    # number of news items produced by each reporter
    news_per_reporter=80,
    # number of readers
    reader=20,
    # how many past weights are kept for each reader
    reader_pass_weight_window=10,
)

#### 分數範圍管理

In [None]:
class Scale:
    """Closed score range [min, max] with helpers to map z-scores onto it.

    Attributes:
        min: lowest allowed score.
        max: highest allowed score.
        mean: midpoint of the range.
        sigma: one third of the half-range, so that +/- 3 sigma around
            the mean spans exactly [min, max].
    """

    def __init__(self, min = 1, max = 10):
        # The parameter names shadow the builtins only inside this constructor.
        self.min, self.max = min, max
        self.mean = (min + max) / 2
        self.sigma = (self.max - self.mean) / 3

    def arange(self):
        """Return every integer score from min to max, both inclusive."""
        upper_exclusive = self.max + 1
        return np.arange(self.min, upper_exclusive)

    def translateZ(self, z_score):
        """Convert a z-score to a score on this scale, clamped to [min, max]."""
        raw = z_score * self.sigma + self.mean

        # Cap at the upper bound first, then lift to the lower bound.
        capped = self.max if self.max < raw else raw
        return self.min if self.min > capped else capped


# Default scale shared by the rest of the notebook.
scale = Scale()

#### Helper Functions

In [None]:
# Expose the index as an explicit id column placed first in the frame
def index_as_id(data_frame, suffix):
    """Return a copy of `data_frame` whose first column `<suffix>_id` holds the index.

    The input frame is left untouched. If a column with the target name
    already exists it is replaced, so calling this twice is harmless.

    Args:
        data_frame: source DataFrame.
        suffix: entity name; the new column is named `suffix + '_id'`.

    Returns:
        A new DataFrame with the id column first, followed by the other
        columns in their original order.
    """
    id_column = suffix + '_id'

    # drop() returns a new frame, so the insert below never touches the caller's data.
    result = data_frame.drop(columns = [id_column], errors = 'ignore')
    result.insert(0, id_column, data_frame.index)

    return result

#### 讀者

In [None]:
def get_readers(num):
    """Build the reader table: one row per reader, ids 0 .. num['reader'] - 1.

    Args:
        num: configuration mapping; only the 'reader' entry is read.

    Returns:
        DataFrame with a single 'reader_id' column.
    """
    reader_ids = np.arange(num['reader'])
    return pd.DataFrame({'reader_id' : reader_ids})

# Build the reader table from the configured head count and preview its first rows.
readers = get_readers(num)
readers.head()

#### 讀者權重管理

In [None]:
class ReaderWeightHolder:
    """Keep each reader's current weight together with a sliding window of past weights.

    The current weight of a reader is the simple average of the values in
    that reader's window.
    """

    # Current weight per reader_id (rebound on every instance by reset()).
    weights = {}

    # Past weights per reader_id, oldest first (rebound on every instance by reset()).
    past_weights = {}

    def __init__(self, readers, init_mean, reader_pass_weight_window = 10):
        """
        Args:
            readers: DataFrame with a 'reader_id' column.
            init_mean: starting weight of every reader; also pre-fills the window.
            reader_pass_weight_window: number of past weights kept per reader.
        """
        self.readers = readers
        self.reader_pass_weight_window = reader_pass_weight_window
        self.init_mean = init_mean

        self.reset()

    def reset(self):
        """Restore every reader to `init_mean` with a window full of `init_mean`."""
        self.past_weights = {reader_id: np.repeat(self.init_mean, self.reader_pass_weight_window).tolist() for reader_id in self.readers['reader_id']}
        self.weights = {reader_id: self.init_mean for reader_id in self.readers['reader_id']}

    def get(self, reader_id):
        """Return the current weight of `reader_id` (KeyError if unknown)."""
        return self.weights[reader_id]

    def inserts(self, new_weights):
        """Push one new weight per reader and refresh the current weights.

        Args:
            new_weights: mapping reader_id -> newly computed weight. Readers
                not present in the mapping are left unchanged.
        """
        for reader_id, new_weight in new_weights.items():
            history = self.past_weights[reader_id]

            # Slide the window: forget the oldest value, remember the newest.
            history.pop(0)
            history.append(new_weight)

            # TODO: simple mean for now; a weighted mean that rewards
            # consistently good behaviour is a possible refinement.
            self.weights[reader_id] = sum(history) / self.reader_pass_weight_window

    def print_weights(self):
        """Return the first rows of this holder's current weights as a DataFrame."""
        # Read from self rather than a module-level instance.
        return pd.DataFrame.from_dict(self.weights, orient = 'index', columns = ['weight']).rename_axis('user_id').head()

    def print_past_weights(self):
        """Return the first rows of this holder's weight history as a DataFrame."""
        return pd.DataFrame.from_dict(self.past_weights, orient = 'index').rename_axis('user_id').head()

    def print_all(self):
        """Print the raw current-weight and history dictionaries."""
        print(self.weights)
        print(self.past_weights)
        
# Holder used by the cells below; every reader starts at the midpoint of the score scale.
reader_weight_holder = ReaderWeightHolder(readers, scale.mean, num['reader_pass_weight_window'])

In [None]:
# Preview the current weight of the first few readers.
reader_weight_holder.print_weights()

In [None]:
# Preview the stored weight history of the first few readers.
reader_weight_holder.print_past_weights()

#### 報導者

In [None]:
def get_reporters(num):
    """Create the reporter table with a randomly drawn credibility per reporter.

    Uses the module-level `scale` to turn z-scores into scores.

    Args:
        num: configuration mapping; only the 'reporter' entry is read.

    Returns:
        DataFrame with columns 'reporter_id', 'reporter_score' and
        'reporter_z_scores'.
    """
    reporter_count = num['reporter']

    # one standard-normal draw per reporter
    drawn_z = np.random.randn(reporter_count)

    # map each z-score onto the score scale (translateZ clamps to the range)
    translated = list(map(scale.translateZ, drawn_z))

    table = pd.DataFrame({
        'reporter_score': translated,
        'reporter_z_scores': drawn_z
    })
    return index_as_id(table, 'reporter')

reporters = get_reporters(num)
reporters.head()

#### 新聞

In [None]:
def get_news(num, scale, reporters):
    """Generate the news table with a true credibility score per item.

    Args:
        num: configuration mapping; 'news_per_reporter' is read.
        scale: object providing translateZ(z) to map z-scores to scores.
        reporters: DataFrame with 'reporter_id' and 'reporter_z_scores'.

    Returns:
        DataFrame with columns 'news_id', 'reporter_id' and 'news_score'.
    """
    per_reporter = num['news_per_reporter']

    # Tile the whole reporter list so ids and z-scores stay aligned row by row.
    owner_ids = list(reporters['reporter_id']) * per_reporter
    owner_z = list(reporters['reporter_z_scores']) * per_reporter

    # A news item's z-score is drawn from N(reporter z-score, 1):
    # standard-normal noise shifted by the reporter's own z-score.
    noise = np.random.randn(len(owner_ids))
    item_z = noise + owner_z

    # true credibility of each item on the score scale
    true_scores = [scale.translateZ(z) for z in item_z]

    # TODO: visualise whether credible reporters really end up with credible news

    table = pd.DataFrame({
        'reporter_id' : owner_ids,
        'news_score' : true_scores
    })
    return index_as_id(table, 'news')

news = get_news(num, scale, reporters)
news.head()
#news.groupby('reporter_id')['news_score'].describe()

#### 評分結果

In [None]:
def get_reviews(num, scale, reviewers, news):
    """Generate one random review per (reviewer, news item) pair.

    Args:
        num: configuration mapping (not read by this function).
        scale: object providing arange() with the allowed review scores.
        reviewers: DataFrame with a 'reader_id' column.
        news: DataFrame with 'news_id' and 'news_score' columns.

    Returns:
        DataFrame with columns 'review_id', 'reviewer_id', 'news_id',
        'news_score' and 'review_score'.
    """
    # Every reviewer reviews every news item; reviewers vary slowest.
    rows = [
        (reviewer_id, item.news_id, item.news_score)
        for reviewer_id in reviewers['reader_id']
        for item in news.itertuples(index = False)
    ]
    table = pd.DataFrame(rows, columns = ['reviewer_id', 'news_id', 'news_score'])

    # TODO: tie the review to the reader's own credibility?
    # TODO: visualise whether credible readers give credible reviews

    # Scores are drawn uniformly at random from the allowed values.
    table['review_score'] = np.random.choice(scale.arange(), len(rows))

    return index_as_id(table, 'review')

reviews = get_reviews(num, scale, readers, news)
reviews.head()

#### 評價結果

In [None]:
def get_judges(num, scale, reviewers, reviews, news):
    """Generate one judgement per (review, judger) pair.

    Args:
        num: configuration mapping (not read by this function).
        scale: object whose `max` is the best possible judge score.
        reviewers: DataFrame with a 'reader_id' column; every reader judges.
        reviews: DataFrame with 'review_id', 'review_score', 'news_score'.
        news: news table (not read by this function).

    Returns:
        DataFrame with columns 'judge_id', 'review_id', 'review_score',
        'news_score', 'judger_id' and 'judge_score'.
    """
    # Every judger judges every review; reviews vary slowest.
    rows = [
        (review.review_id, review.review_score, review.news_score, judger_id)
        for review in reviews.itertuples(index = False)
        for judger_id in reviewers['reader_id']
    ]
    table = pd.DataFrame(rows, columns = ['review_id', 'review_score', 'news_score', 'judger_id'])

    # Rational judgers: the closer a review is to the true score, the higher
    # the judge score, which peaks at scale.max for a perfect review.
    best = scale.max
    table['judge_score'] = [
        best - abs(given - actual)
        for given, actual in zip(table['review_score'], table['news_score'])
    ]

    return index_as_id(table, 'judge')

judges = get_judges(num, scale, readers, reviews, news)
judges.head()

#### 合併在一起，方便後續測試

In [None]:
def get_data(news, reviews, judges):
    """Join judges, reviews and news into one flat table for the weight calculations.

    Both joins use `right_index = True`, so the index of `reviews` and
    `news` must equal their review_id / news_id.

    Args:
        news: DataFrame indexed by news id.
        reviews: DataFrame indexed by review id, with a 'news_id' column.
        judges: DataFrame with 'review_id' and 'judger_id' columns.

    Returns:
        An independent DataFrame with one row per judgement, the judger's
        current weight in 'judger_weight', and self-judgements removed.
    """
    def _columns_missing_from(left, right):
        # Only bring over columns the left frame lacks, so the merge
        # creates no duplicated _x/_y columns.
        return right.columns.difference(left.columns)

    data = judges.merge(reviews[_columns_missing_from(judges, reviews)], left_on = 'review_id', right_index = True)
    data = data.merge(news[_columns_missing_from(data, news)], left_on = 'news_id', right_index = True)
    data['judger_weight'] = [reader_weight_holder.get(judger_id) for judger_id in data['judger_id']]

    # Drop rows where a reader judges their own review. The explicit copy
    # lets callers add columns later without SettingWithCopyWarning.
    data = data[data.judger_id != data.reviewer_id].copy()

    return data

data = get_data(news, reviews, judges)
data.head()

In [None]:
# Distribution of the mean judge score per news item
import matplotlib.pyplot as plt

# Average judge_score over all rows that belong to the same news item.
v = data.groupby('news_id')['judge_score'].mean()
# density = True normalises the histogram instead of showing raw counts.
plt.hist(v, density = True)

## 開始計算可信度 / 權重

In [None]:
# Compute the credibility / weight of every reader
def get_reviewer_weights(data):
    """Return each reviewer's new weight as a weighted mean of the judge scores they received.

    A judge score rates the quality of one review, and therefore of its
    reviewer. Each score is weighted by the credibility of the judger
    who gave it:

        weight(user) = sum(judge_score_i * judger_weight_i) / sum(judger_weight_i)

    Args:
        data: DataFrame with 'reviewer_id', 'judge_score' and
            'judger_weight' columns.

    Returns:
        Series indexed by reviewer_id holding the new weights.
    """
    def _weighted_judge_score(group):
        return np.average(group['judge_score'], weights = group['judger_weight'])

    # Select only the needed columns so apply() never sees the grouping
    # column (operating on it is deprecated in recent pandas).
    needed = data.groupby('reviewer_id')[['judge_score', 'judger_weight']]
    reviewer_weights = needed.apply(_weighted_judge_score)

    return reviewer_weights

# New per-reader weights derived from the combined table; displayed in full.
reviewer_weights = get_reviewer_weights(data)
reviewer_weights

In [None]:
# Update the readers' weights
def update_reader_weight(reviewer_weights, data):
    """Feed new weights into the global holder and stamp them onto `data`.

    Args:
        reviewer_weights: Series mapping reader id -> newly computed weight.
        data: DataFrame with a 'reviewer_id' column; a 'reviewer_weight'
            column is added or overwritten in place.
    """
    fresh_weights = reviewer_weights.to_dict()
    reader_weight_holder.inserts(fresh_weights)

    # Look the weights up only after the holder has been refreshed.
    current_weight_of = reader_weight_holder.get
    data['reviewer_weight'] = [current_weight_of(reviewer_id) for reviewer_id in data['reviewer_id']]

update_reader_weight(reviewer_weights, data)
reader_weight_holder.print_all()

In [None]:
# Compute the credibility / weight of every news item
def get_news_weight(data):
    """Return each news item's credibility as a weighted mean of its review scores.

    Each review score is weighted by the credibility of the reader who
    gave it:

        weight(news) = sum(review_score_i * reviewer_weight_i) / sum(reviewer_weight_i)

    Args:
        data: DataFrame with 'news_id', 'review_score' and
            'reviewer_weight' columns.

    Returns:
        Series indexed by news_id holding the credibility of each item.
    """
    def _weighted_review_score(group):
        return np.average(group['review_score'], weights = group['reviewer_weight'])

    # Select only the needed columns so apply() never sees the grouping
    # column (operating on it is deprecated in recent pandas).
    needed = data.groupby('news_id')[['review_score', 'reviewer_weight']]
    news_weights = needed.apply(_weighted_review_score)

    return news_weights

# Credibility of every news item from the combined table; preview the first rows.
news_weights = get_news_weight(data)
news_weights.head()

In [None]:
# Compute the credibility / weight of every reporter
def get_reporter_weights(data):
    """Return each reporter's credibility as a weighted mean of the review scores of their news.

    Each review score is weighted by the credibility of the reader who
    gave it, pooled over all news items of the same reporter.

    Args:
        data: DataFrame with 'reporter_id', 'review_score' and
            'reviewer_weight' columns.

    Returns:
        Series indexed by reporter_id holding the credibility of each reporter.
    """
    # Defined here so this function does not depend on a helper that is
    # local to another function.
    def _weighted_review_score(group):
        return np.average(group['review_score'], weights = group['reviewer_weight'])

    needed = data.groupby('reporter_id')[['review_score', 'reviewer_weight']]
    reporter_weights = needed.apply(_weighted_review_score)

    return reporter_weights

# Credibility of every reporter from the combined table; preview the first rows.
reporter_weights = get_reporter_weights(data)
reporter_weights.head()

In [None]:
# Repeat the whole process several times
def simulate(times):
    """Run `times` rounds of news -> reviews -> judges -> reader-weight update.

    Every round generates fresh news, reviews and judgements for the
    existing reporters and readers, then feeds the resulting reader
    weights into the global weight holder.

    Args:
        times: number of rounds to run.

    Returns:
        The combined table of the last round (with 'reviewer_weight'
        filled in), or None when no round was run.
    """
    latest = None
    for _ in range(times):
        round_news = get_news(num, scale, reporters)
        round_reviews = get_reviews(num, scale, readers, round_news)
        round_judges = get_judges(num, scale, readers, round_reviews, round_news)
        latest = get_data(round_news, round_reviews, round_judges)

        round_weights = get_reviewer_weights(latest)
        update_reader_weight(round_weights, latest)

    return latest

simulated_data = simulate(5)

# Evaluate reporters on the last simulated round, not on the table built
# before the simulation started.
get_reporter_weights(simulated_data)

In [None]:
# Plot the stored weight history of every reader, one line per reader.
past_weights = reader_weight_holder.past_weights

for value in past_weights.values():
    # x axis: position inside the history window (oldest first); y axis: weight
    plt.plot(range(len(value)), value)

plt.show()


In [None]:
# Number of non-null values in every column of the combined table.
data.count()

### 不公正的使用者

#### 評分不公正，評價公正

#### 評分公正，評價不公正

#### 評分和評價都不公正