In [74]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Any results you write to the current directory are saved as output.

In [81]:
# Read the training pairs, then replace every missing value with the
# placeholder string "no_comments". The test pairs are read as-is.
train = pd.read_csv("../data/dataset/train.csv")
train = train.fillna("no_comments")
test = pd.read_csv("../data/dataset/test.csv")

In [82]:
# Drop every training pair whose label is 'unrelated'.
train = train[train['label'] != 'unrelated']

In [83]:
# Copy the raw title1_zh / title2_zh values into the columns that build_graph
# reads as node identifiers. Despite the "_hash" suffix, no hashing is applied.
train['spn_1_hash'] = train['title1_zh']
train['spn_2_hash'] = train['title2_zh']

In [84]:
# Binarise the label: 'disagreed' -> 0, every other remaining label -> 1.
# NOTE(review): not idempotent. On a second run the column already holds ints,
# none of which equals 'disagreed', so every label would become 1.
train['label'] = (train['label'] != 'disagreed').astype(int)

In [85]:
# Class balance after binarisation (1 = not 'disagreed', 0 = 'disagreed').
train['label'].value_counts()

1    92973
0     8266
Name: label, dtype: int64

In [90]:
def same_order(a, b):
    """Return the two arguments as the tuple ``(a, b)``, in the order given.

    No sorting or normalisation is applied, so ``same_order(a, b)`` and
    ``same_order(b, a)`` are distinct keys whenever ``a != b``.
    """
    pair = (a, b)
    return pair

In [91]:
from collections import defaultdict
# Adjacency map: node -> set of nodes it links to (unknown nodes start empty).
graph = defaultdict(set)
# Directed (src, dst) pairs: positive links, negative links, and all links.
pos_edges, neg_edges, edges = set(), set(), set()

def build_graph(row):
    """Record one training pair in the module-level graph and edge sets.

    ``row`` must provide ``'spn_1_hash'``, ``'spn_2_hash'`` and ``'label'``.
    The link from node 2 to node 1 is always stored in ``graph`` and ``edges``.
    A truthy label also stores the opposite direction in ``graph`` and puts
    both directions in ``pos_edges``; a falsy label puts only the
    node 2 -> node 1 pair in ``neg_edges``.  Returns ``None``.
    """
    first = row['spn_1_hash']
    second = row['spn_2_hash']
    is_positive = row['label']

    backward = same_order(second, first)

    # node 2 always links to node 1, whatever the label is
    graph[second].add(first)

    if not is_positive:
        neg_edges.add(backward)
    else:
        # positive pairs are treated as bidirectional
        pos_edges.add(same_order(first, second))
        pos_edges.add(backward)
        graph[first].add(second)

    edges.add(backward)
    
# Run build_graph once per training row for its side effects on the module-level
# graph/edge sets. build_graph has no return statement, so `n` holds only None values.
n = train[['spn_1_hash', 'spn_2_hash', 'label']].apply(build_graph, axis=1)

In [92]:
# Inferred (src, dst) pairs, filled by augments(): positive and negative.
pos_augments, neg_augments = set(), set()

In [95]:
def add_pos_edges():
    """Print how consistently positive links close triangles.

    For every directed pair ``(src, dst)`` in ``pos_edges``, each node ``v``
    that both ``src`` and ``dst`` link to in ``graph`` is examined.  When
    ``(dst, v)`` is a positive edge the triangle is counted; it is counted as
    consistent when ``(src, v)`` is a positive edge as well, otherwise the
    pair is printed with a ``POS-Tricky`` prefix.

    Despite its name, this function adds nothing to any edge set: it only
    prints statistics and returns ``None``.
    """
    counter = 0      # triangles in which (dst, v) is a positive edge
    ncc = 0          # positive edges whose endpoints share no neighbour
    pos_counter = 0  # counted triangles in which (src, v) is positive too

    # TRIANGLE CASE: A->B, B->C and A->C. If A agrees with B and B with C,
    # does A agree with C?
    for src, dst in pos_edges:
        src_point_to = graph[src]
        dst_point_to = graph[dst]
        # nodes that both A and B link to
        src_dst_both_point_to = src_point_to & dst_point_to

        if not src_dst_both_point_to:
            ncc += 1
            continue

        for v in src_dst_both_point_to:
            if (dst, v) not in pos_edges:
                continue
            counter += 1
            if (src, v) in pos_edges:
                pos_counter += 1
            else:
                print("POS-Tricky", src, "|", v)

    print("pos_counter", pos_counter, "counter", counter)
    # With no qualifying triangle the ratio is 0/0; print NaN instead of
    # raising ZeroDivisionError and aborting the surrounding loop.
    ratio = pos_counter / counter if counter else float("nan")
    print("Triangle case", ratio)
    print("NCC", ncc)
    
def add_neg_edges():
    """Print how consistently negative links propagate through positive ones.

    For every directed pair ``(src, dst)`` in ``neg_edges``, each node ``v``
    that both ``src`` and ``dst`` link to in ``graph`` is examined.  When
    ``(dst, v)`` is a positive edge the triangle is counted; it is counted as
    consistent when ``(src, v)`` is a negative edge, otherwise the pair is
    printed with a ``NEG-Tricky`` prefix.

    Despite its name, this function adds nothing to any edge set: it only
    prints statistics and returns ``None``.
    """
    counter = 0      # triangles in which (dst, v) is a positive edge
    neg_counter = 0  # counted triangles in which (src, v) is negative

    for src, dst in neg_edges:
        src_point_to = graph[src]
        dst_point_to = graph[dst]
        src_dst_both_point_to = src_point_to & dst_point_to

        for v in src_dst_both_point_to:
            if (dst, v) not in pos_edges:
                continue
            counter += 1
            if (src, v) in neg_edges:
                neg_counter += 1
            else:
                print("NEG-Tricky", src, "|", v)

    print('neg_counter', neg_counter, 'counter', counter)
    # With no qualifying triangle the ratio is 0/0; print NaN instead of
    # raising ZeroDivisionError and aborting the surrounding loop.
    ratio = neg_counter / counter if counter else float("nan")
    print("Triangle case", ratio)

def augments():
    """Infer new pairs by following positive links one step further.

    For each known edge ``(src, dst)``, every node ``v`` that ``dst`` links to
    but ``src`` does not, and for which ``(dst, v)`` is a positive edge,
    yields the inferred pair ``(src, v)``.  Pairs inferred from ``pos_edges``
    are added to ``pos_augments``, pairs inferred from ``neg_edges`` to
    ``neg_augments``.  Both set sizes are printed afterwards.
    """
    # Positive edges are processed first, then negative ones.
    passes = ((pos_edges, pos_augments), (neg_edges, neg_augments))

    for known_edges, inferred in passes:
        for src, dst in known_edges:
            reachable_from_src = graph[src]
            reachable_from_dst = graph[dst]

            # neighbours of dst that src is not linked to yet
            for v in reachable_from_dst - reachable_from_src:
                if (dst, v) in pos_edges:
                    inferred.add((src, v))

    print("Augmented pos cases", len(pos_augments))
    print("Augmented neg cases", len(neg_augments))
    
def add_augmented_links():
    """Merge the inferred pairs into the graph and the edge sets.

    Every pair in ``pos_augments`` becomes a link in ``graph`` and an entry in
    ``pos_edges`` and ``edges``; every pair in ``neg_augments`` becomes a link
    in ``graph`` and an entry in ``neg_edges`` and ``edges``.
    """
    merges = ((pos_augments, pos_edges), (neg_augments, neg_edges))

    for inferred, typed_edges in merges:
        for src, dst in inferred:
            link = same_order(src, dst)
            graph[src].add(dst)
            typed_edges.add(link)
            edges.add(link)

In [96]:
# Repeat the report / infer / merge cycle until a pass infers no new pair.
i = 0

while True:
    i += 1
    print("In interation", i)

    # Remember the set sizes so growth during this pass can be detected.
    pos_size, neg_size = len(pos_augments), len(neg_augments)

    add_pos_edges()
    add_neg_edges()
    augments()
    add_augmented_links()

    # Fixed point reached: neither augmentation set grew in this pass.
    if (len(pos_augments), len(neg_augments)) == (pos_size, neg_size):
        print("Finished")
        break

In interation 1
POS-Tricky 传言币安三天流入5万枚比特币！官微辟谣：还能再假一点吗？ | 传言币安三天流入5万枚比特币！官微辟谣：还能再假一点吗？
POS-Tricky 传言币安三天流入5万枚比特币！官微辟谣：还能再假一点吗？ | 传言币安三天流入5万枚比特币！官微辟谣：还能再假一点吗？
POS-Tricky “十年鸡头胜砒霜”网上疯传的农产品谣言，真相到底是什么？ | 十年鸡头如砒霜？医生建议：这8种肉最好别吃
POS-Tricky 传言币安三天流入5万枚比特币！官微辟谣：还能再假一点吗？ | 传言币安三天流入5万枚比特币！官微辟谣：还能再假一点吗？
POS-Tricky 周润发因病去世？发嫂发声辟谣：我老公很好 | 周润发因病去世？发嫂首次回应，50亿捐款已经办妥！
pos_counter 556327 counter 556332
Triangle case 0.999991012560845
NCC 52816
NEG-Tricky “十年鸡头胜砒霜”网上疯传的农产品谣言，真相到底是什么？ | 农村俗话讲“十年鸡头胜砒霜”，这句话能相信吗？
NEG-Tricky 传言币安三天流入5万枚比特币！官微辟谣：还能再假一点吗？ | 「币圈下午茶」币安3天内流入5万枚BTC /市值前30的币种全部下跌
NEG-Tricky 传言币安三天流入5万枚比特币！官微辟谣：还能再假一点吗？ | 3天内币安流入5万枚BTC，在密谋些什么？
NEG-Tricky 传言币安三天流入5万枚比特币！官微辟谣：还能再假一点吗？ | 「迅解区块链」3天内流入5万枚BTC，币安到底在密谋些什么呢？
NEG-Tricky 周润发因病去世？发嫂发声辟谣：我老公很好 | 周润发因病去世？发嫂终于回应，称50亿捐款已办妥！
neg_counter 2492 counter 2497
Triangle case 0.9979975971165399
Augmented pos cases 484526
Augmented neg cases 7995
In interation 2
POS-Tricky 周润发因病去世？发嫂发声辟谣：我老公很好 | 周润发因病去世？发嫂首次回应，50亿捐款已经办妥！
POS-Tricky 传言币安三天流入5万枚比特币！官微辟谣：还能再假一点吗？

NEG-Tricky 周润发因病去世？发嫂发声辟谣：我老公很好 | 一代影视巨星周润发，发哥因病在香港去世，发嫂做出回应！
NEG-Tricky 周润发因病去世？发嫂发声辟谣：我老公很好 | 周润发因病去世？香港新闻截图疯传 发嫂：大吉大利！
NEG-Tricky 周润发因病去世？发嫂发声辟谣：我老公很好 | 周润发因病去世？香港新闻截图疯传发嫂发声：大吉大利我老公很好
NEG-Tricky 传言币安三天流入5万枚比特币！官微辟谣：还能再假一点吗？ | 「币圈下午茶」币安3天内流入5万枚BTC /市值前30的币种全部下跌
NEG-Tricky 传言币安三天流入5万枚比特币！官微辟谣：还能再假一点吗？ | 3天内币安流入5万枚BTC，在密谋些什么？
NEG-Tricky 传言币安三天流入5万枚比特币！官微辟谣：还能再假一点吗？ | 「迅解区块链」3天内流入5万枚BTC，币安到底在密谋些什么呢？
neg_counter 649842 counter 649862
Triangle case 0.9999692242352992
Augmented pos cases 707106
Augmented neg cases 19091
Finished


In [97]:
# Re-read the test pairs (the same file was already loaded into `test` in an earlier cell),
# giving a fresh frame before the mark_* columns are added.
test = pd.read_csv("../data/dataset/test.csv")

In [98]:
# Distinct first elements (the `src` side) of the inferred negative pairs.
neg_samples = {pair[0] for pair in neg_augments}

In [118]:
def mark_neg(row):
    """Return 1 if ``(title2_zh, title1_zh)`` of ``row`` is in ``neg_augments``, else 0."""
    reversed_pair = (row['title2_zh'], row['title1_zh'])
    return 1 if reversed_pair in neg_augments else 0

def mark_pos(row):
    """Return 1 if the row's title pair, in either order, is in ``pos_augments``, else 0."""
    forward = (row['title1_zh'], row['title2_zh'])
    if forward in pos_augments:
        return 1
    backward = (row['title2_zh'], row['title1_zh'])
    return 1 if backward in pos_augments else 0

def mark(row):
    """Label a test row from the inferred pairs.

    Returns ``'agreed'`` when the title pair, in either order, is in
    ``pos_augments``.  Otherwise returns ``'disagreed'`` when
    ``(title2_zh, title1_zh)`` is in ``neg_augments``, and ``'failed'`` when
    neither set contains the pair.  Positive matches take precedence.
    """
    forward = (row['title1_zh'], row['title2_zh'])
    if forward in pos_augments:
        return 'agreed'

    backward = (row['title2_zh'], row['title1_zh'])
    if backward in pos_augments:
        return 'agreed'

    return 'disagreed' if backward in neg_augments else 'failed'
        
# Row-wise flags and the combined label for every test pair.
test['mark_neg'] = test.apply(mark_neg, axis=1)
test['mark_pos'] = test.apply(mark_pos, axis=1)
test['deal_with_the_devil'] = test.apply(mark, axis=1)

In [119]:
# How many test pairs mark() labelled 'agreed' / 'disagreed' versus left as 'failed'.
test['deal_with_the_devil'].value_counts()

failed       73238
agreed        6774
disagreed      114
Name: deal_with_the_devil, dtype: int64

In [112]:
# Number of test rows whose (title2_zh, title1_zh) pair is in neg_augments.
test['mark_neg'].sum()

114

In [113]:
# Number of test rows whose title pair, in either order, is in pos_augments.
test['mark_pos'].sum()

6774

In [116]:
# Load the existing predictions that the graph-derived labels are compared with below.
best_predictions = pd.read_csv('../data/high_ground/final_answer.csv')

# Check improvement

In [123]:
# Boolean mask over test rows: True where mark() produced a label other than 'failed'.
labeled = test['deal_with_the_devil'] != 'failed'

In [126]:
# Class counts of the existing predictions on the rows the graph could label.
# NOTE(review): the mask was built from `test`; this assumes best_predictions
# has the same index and row order as test. Confirm.
best_predictions[labeled]['Category'].value_counts()

agreed       5956
unrelated     843
disagreed      89
Name: Category, dtype: int64

In [159]:
# Count the graph-labelled rows where the prediction differs from the graph-derived label.
print("Difference", (best_predictions[labeled]['Category'] != test[labeled]['deal_with_the_devil']).sum())

Difference 0


In [143]:
# Class counts over all prediction rows.
best_predictions['Category'].value_counts()

unrelated    51536
agreed       26490
disagreed     2100
Name: Category, dtype: int64

In [148]:
# The same class counts expressed as a fraction of all prediction rows.
best_predictions['Category'].value_counts() / len(best_predictions)

unrelated    0.632666
agreed       0.340813
disagreed    0.026521
Name: Category, dtype: float64

In [138]:
# Number of test rows that received a graph-derived label.
len((test[labeled]['deal_with_the_devil']).values)

6888

In [144]:
# Attach the graph-derived label as a temporary column. `.values` assigns by
# position, bypassing index alignment.
# NOTE(review): assumes both frames list the same rows in the same order. Confirm.
best_predictions['Fake'] = test['deal_with_the_devil'].values

In [145]:
def deal_with_the_devil(row):
    """Pick the final category for one prediction row.

    ``row['Fake']`` is returned when it is not ``'failed'`` and differs from
    ``row['Category']``; in every other case ``row['Category']`` is returned.
    """
    override = row['Fake']
    current = row['Category']
    # Keep the existing prediction when there is no override or nothing to change.
    if override == 'failed' or override == current:
        return current
    return override
    

# Overwrite each prediction with its graph-derived label wherever one exists.
best_predictions['Category'] = best_predictions[['Category', 'Fake']].apply(deal_with_the_devil, axis=1)

In [157]:
# Remove the temporary helper column before saving.
best_predictions = best_predictions.drop(columns=['Fake'])

In [160]:
# Save the corrected predictions.
# NOTE(review): this writes to the same path best_predictions was read from,
# so the original predictions are replaced on disk and a re-run of the
# notebook starts from the already-corrected file.
best_predictions.to_csv("../data/high_ground/final_answer.csv", index=False)

In [158]:
# Display the final predictions frame.
best_predictions

Unnamed: 0,Category,Id
0,unrelated,321187
1,unrelated,321190
2,unrelated,321189
3,unrelated,321193
4,unrelated,321191
5,unrelated,321194
6,unrelated,321192
7,unrelated,321197
8,unrelated,321195
9,agreed,321199
