# Automatic Fact Checking - Climate Claims

## Section 1: Data Processing - a. Preprocessing

### 1. Load Data from JSON Files

#### 1.1 Load Original Data

In [1]:
import numpy as np
import pandas as pd
import random
import math

In [2]:
# Read the three claim splits and the evidence corpus. orient='index' makes
# each top-level JSON key (the claim / evidence id) a row label.
data_tran, data_vald, data_test, data_evdn = (
    pd.read_json(json_path, orient='index')
    for json_path in (
        "/root/COMP90042/data/data1/train-claims.json",
        "/root/COMP90042/data/data1/dev-claims.json",
        "/root/COMP90042/data/data1/test-claims-unlabelled.json",
        "/root/COMP90042/data/data1/evidence.json",
    )
)

#### 1.2 Show Original Data

Training Data

In [3]:
data_tran.head(3)

Unnamed: 0,claim_text,claim_label,evidences
claim-1937,Not only is there no scientific evidence that ...,DISPUTED,"[evidence-442946, evidence-1194317, evidence-1..."
claim-126,El Niño drove record highs in global temperatu...,REFUTES,"[evidence-338219, evidence-1127398]"
claim-2510,"In 1946, PDO switched to a cool phase.",SUPPORTS,"[evidence-530063, evidence-984887]"


Validation Data

In [4]:
data_vald.head(3)

Unnamed: 0,claim_text,claim_label,evidences
claim-752,[South Australia] has the most expensive elect...,SUPPORTS,"[evidence-67732, evidence-572512]"
claim-375,when 3 per cent of total annual global emissio...,NOT_ENOUGH_INFO,"[evidence-996421, evidence-1080858, evidence-2..."
claim-1266,This means that the world is now 1C warmer tha...,SUPPORTS,"[evidence-889933, evidence-694262]"


Testing Data

In [5]:
data_test.head(3)

Unnamed: 0,claim_text
claim-2967,The contribution of waste heat to the global c...
claim-979,“Warm weather worsened the most recent five-ye...
claim-1609,Greenland has only lost a tiny fraction of its...


Evidence

In [6]:
data_evdn.head(3)

Unnamed: 0,0
evidence-0,"John Bennet Lawes, English entrepreneur and ag..."
evidence-1,Lindberg began his professional career at the ...
evidence-2,``Boston (Ladies of Cambridge)'' by Vampire We...


### 2. Preprocess Data

#### 2.1 Convert Claim ID & Evidence ID into Integer Type

In [7]:
def process_claim_id(df):
    """Turn 'claim-<n>' row labels into an integer ``claim_id`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Claims frame whose index holds string labels such as ``'claim-1937'``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index and ``claim_id`` as the first
        column, followed by the original columns. ``df`` is left unmodified,
        so the function can safely be called again on the same input.
    """
    # Parse the ids into their own object rather than assigning to df.index,
    # which would change the caller's frame as a side effect.
    claim_ids = df.index.str.replace('claim-', '', regex=False).astype(int)

    # reset_index(drop=True) returns a new frame with a default integer index.
    out = df.reset_index(drop=True)
    out.insert(0, 'claim_id', claim_ids.to_numpy())
    return out

def process_claim_evidence(df):
    """Convert each claim's evidence labels to integer ids.

    Parameters
    ----------
    df : pandas.DataFrame
        Claims frame with an ``evidences`` column holding lists of labels such
        as ``'evidence-442946'`` (ids that are already integers are accepted).

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``evidences`` column holds lists of ``int``.
        ``df`` is left unmodified.
    """
    def to_int_ids(evidence_ids):
        # str() lets already-converted integer ids pass through unchanged, so
        # re-running the preprocessing cell does not raise.
        return [int(str(eid).replace('evidence-', '')) for eid in evidence_ids]

    # assign() builds a new frame instead of overwriting the caller's column.
    return df.assign(evidences=df['evidences'].apply(to_int_ids))

In [8]:
# Labelled splits: numeric claim_id column first, then integer evidence ids.
data_tran = process_claim_evidence(process_claim_id(data_tran))
data_vald = process_claim_evidence(process_claim_id(data_vald))

# The test split only gets its claim ids converted.
data_test = process_claim_id(data_test)

In [9]:
def process_evidence_id(df):
    """Turn 'evidence-<n>' row labels into an integer ``evidence_id`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Evidence frame whose index holds string labels such as ``'evidence-0'``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index and ``evidence_id`` as the first
        column, followed by the original columns. ``df`` is left unmodified,
        so the function can safely be called again on the same input.
    """
    # Parse the ids into their own object rather than assigning to df.index,
    # which would change the caller's frame as a side effect.
    evidence_ids = df.index.str.replace('evidence-', '', regex=False).astype(int)

    # reset_index(drop=True) returns a new frame with a default integer index.
    out = df.reset_index(drop=True)
    out.insert(0, 'evidence_id', evidence_ids.to_numpy())
    return out

def process_evidence_rename_column(df):
    """Return a copy of ``df`` with the column labelled ``0`` renamed to ``'evidence'``."""
    column_names = {0: 'evidence'}
    return df.rename(columns=column_names)

In [10]:
# Evidence: numeric evidence_id column first, then name the text column.
data_evdn = process_evidence_rename_column(process_evidence_id(data_evdn))

#### 2.2 Positive & Negative Sampling

In [11]:
def process_claim_sampling(df, data_evdn, total_evidence_count, negative_size=10, seed=None):
    """Attach positive and randomly drawn negative evidence to every claim.

    Parameters
    ----------
    df : pandas.DataFrame
        Claims frame with an ``evidences`` column holding lists of integer
        evidence ids.
    data_evdn : pandas.DataFrame
        Evidence table with an ``evidence`` column. When it also has an
        ``evidence_id`` column, rows are looked up by that id; otherwise the
        frame's own index is used as the id.
    total_evidence_count : int
        Negative ids are drawn from ``range(total_evidence_count)``.
    negative_size : int, default 10
        Number of negative ids drawn per claim.
    seed : optional
        When given, a private ``random.Random(seed)`` drives the sampling so
        results are repeatable. When ``None``, the module-level ``random``
        generator is used (and can be seeded with ``random.seed``).

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``df`` plus two columns, ``evidence_positive``
        and ``evidence_negative``. Each cell is a dict mapping an evidence id
        to the matching row of the evidence table (a Series with a single
        ``evidence`` entry). Ids absent from the table are skipped. ``df`` is
        left unmodified.

    Raises
    ------
    ValueError
        If ``negative_size`` is negative or larger than the number of ids in
        ``range(total_evidence_count)`` not already linked to the claim.
    """
    # Key the lookup table by the real evidence id when it is available, so a
    # table that is not sorted by id still returns the right text.
    if 'evidence_id' in data_evdn.columns:
        lookup = data_evdn.set_index('evidence_id')[['evidence']]
    else:
        lookup = data_evdn[['evidence']]
    lookup.index = lookup.index.astype(int)

    rng = random if seed is None else random.Random(seed)

    def positive_sampling(evidence_ids):
        return {eid: lookup.loc[eid] for eid in evidence_ids if eid in lookup.index}

    def negative_sampling(existing_ids):
        excluded = set(existing_ids)
        # Only ids inside the sampling range reduce the available population.
        blocked = sum(1 for eid in excluded if 0 <= eid < total_evidence_count)
        if negative_size < 0 or negative_size > total_evidence_count - blocked:
            raise ValueError("Sample larger than population or is negative")

        # Draw `blocked` extra ids: at most that many can collide with the
        # claim's own evidence, so enough remain after filtering. Sampling
        # from range() avoids building a list of every id for each claim.
        candidates = rng.sample(range(total_evidence_count), negative_size + blocked)
        sampled_ids = [eid for eid in candidates if eid not in excluded][:negative_size]
        return positive_sampling(sampled_ids)

    # assign() returns a new frame rather than adding columns to the caller's.
    return df.assign(
        evidence_positive=df['evidences'].apply(positive_sampling),
        evidence_negative=df['evidences'].apply(negative_sampling),
    )

In [12]:
# Seed the global generator so the randomly drawn negative evidence (and the
# files saved from it) are identical on every Restart & Run All.
random.seed(42)

evidence_total = len(data_evdn)
data_tran = process_claim_sampling(data_tran, data_evdn, evidence_total)
data_vald = process_claim_sampling(data_vald, data_evdn, evidence_total)

#### 2.3 Show Processed Data

Training Data

In [13]:
data_tran.head(3)

Unnamed: 0,claim_id,claim_text,claim_label,evidences,evidence_positive,evidence_negative
0,1937,Not only is there no scientific evidence that ...,DISPUTED,"[442946, 1194317, 12171]",{442946: ['At very high concentrations (100 ti...,{253996: ['Chereuta anthracistis is a moth in ...
1,126,El Niño drove record highs in global temperatu...,REFUTES,"[338219, 1127398]",{338219: ['While ‘climate change’ can be due t...,{163465: ['His song ``Perfect'' (featuring JR ...
2,2510,"In 1946, PDO switched to a cool phase.",SUPPORTS,"[530063, 984887]",{530063: ['There is evidence of reversals in t...,{215844: ['This iguana feeds almost exclusivel...


Validation Data

In [14]:
data_vald.head(3)

Unnamed: 0,claim_id,claim_text,claim_label,evidences,evidence_positive,evidence_negative
0,752,[South Australia] has the most expensive elect...,SUPPORTS,"[67732, 572512]",{67732: ['[citation needed] South Australia ha...,{916140: ['It started officially broadcasting ...
1,375,when 3 per cent of total annual global emissio...,NOT_ENOUGH_INFO,"[996421, 1080858, 208053, 699212, 832334]",{996421: ['The 2011 UNEP Green Economy report ...,{871471: ['The Igor Cassini Show was a DuMont ...
2,1266,This means that the world is now 1C warmer tha...,SUPPORTS,"[889933, 694262]",{889933: ['Multiple independently produced ins...,{1047570: ['House is a historic house located ...


Testing Data

In [15]:
data_test.head(3)

Unnamed: 0,claim_id,claim_text
0,2967,The contribution of waste heat to the global c...
1,979,“Warm weather worsened the most recent five-ye...
2,1609,Greenland has only lost a tiny fraction of its...


Evidence

In [16]:
data_evdn.head(3)

Unnamed: 0,evidence_id,evidence
0,0,"John Bennet Lawes, English entrepreneur and ag..."
1,1,Lindberg began his professional career at the ...
2,2,``Boston (Ladies of Cambridge)'' by Vampire We...


### 3. Exploratory Data Analysis

#### 3.1 Summary Statistics

In [17]:
def count_text_length(df, text_column):
    """Count whitespace-separated tokens in every entry of ``df[text_column]``.

    Entries that are not strings count as 0 tokens. Returns a Series aligned
    with ``df``.
    """
    def token_count(value):
        if not isinstance(value, str):
            return 0
        return len(value.split())

    return df[text_column].apply(token_count)

def count_list_length(df, list_column='evidences'):
    """Return the length of the list stored in every entry of ``df[list_column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the list-valued column.
    list_column : str, default 'evidences'
        Name of the column to measure. The default keeps existing one-argument
        calls working.

    Returns
    -------
    pandas.Series
        One count per row; entries that are not lists count as 0.
    """
    return df[list_column].apply(lambda x: len(x) if isinstance(x, list) else 0)

In [18]:
# Compute every length series once. The original recomputed each of them for
# the max, min and mean lines, including three full passes over the evidence table.
evdn_tokens = count_text_length(data_evdn, 'evidence')
tran_tokens = count_text_length(data_tran, 'claim_text')
vald_tokens = count_text_length(data_vald, 'claim_text')
test_tokens = count_text_length(data_test, 'claim_text')
tran_evidence = count_list_length(data_tran)
vald_evidence = count_list_length(data_vald)

print("Data Summary Statistics:")
print("   1. Evidence")
print("      Original number of evidence = " + str(len(data_evdn)) + ".")
print("      Maximum number of tokens in evidence text = " + str(round(np.max(evdn_tokens),0)) + ".")
print("      Minimum number of tokens in evidence text = " + str(round(np.min(evdn_tokens),0)) + ".")
print("      Average number of tokens in evidence text = " + str(round(np.mean(evdn_tokens),2)) + ".")
print("   2. Data Sets")
print("      (1) Training Data")
print("          Sample size of training data = " + str(len(data_tran)) + ".")
print("          Maximum number of tokens in claim text = " + str(round(np.max(tran_tokens),0)) + ".")
print("          Minimum number of tokens in claim text = " + str(round(np.min(tran_tokens),0)) + ".")
print("          Average number of tokens in claim text = " + str(round(np.mean(tran_tokens),2)) + ".")
print("          Maximum number of evidence for training data = " + str(round(np.max(tran_evidence),0)) + ".")
print("          Minimum number of evidence for training data = " + str(round(np.min(tran_evidence),0)) + ".")
print("          Average number of evidence for training data = " + str(round(np.mean(tran_evidence),2)) + ".")
print("      (2) Developing Data")
print("          Sample size of developing data = " + str(len(data_vald)) + ".")
print("          Maximum number of tokens in claim text = " + str(round(np.max(vald_tokens),0)) + ".")
print("          Minimum number of tokens in claim text = " + str(round(np.min(vald_tokens),0)) + ".")
print("          Average number of tokens in claim text = " + str(round(np.mean(vald_tokens),2)) + ".")
print("          Maximum number of evidence for developing data = " + str(round(np.max(vald_evidence),0)) + ".")
print("          Minimum number of evidence for developing data = " + str(round(np.min(vald_evidence),0)) + ".")
print("          Average number of evidence for developing data = " + str(round(np.mean(vald_evidence),2)) + ".")
print("      (3) Testing Data")
# Message fixed: the original read "testing test".
print("          Sample size of testing data = " + str(len(data_test)) + ".")
print("          Maximum number of tokens in claim text = " + str(round(np.max(test_tokens),0)) + ".")
print("          Minimum number of tokens in claim text = " + str(round(np.min(test_tokens),0)) + ".")
print("          Average number of tokens in claim text = " + str(round(np.mean(test_tokens),2)) + ".")

Data Summary Statistics:
   1. Evidence
      Original number of evidence = 1208827.


      Maximum number of tokens in evidence text = 479.
      Minimum number of tokens in evidence text = 1.
      Average number of tokens in evidence text = 19.69.
   2. Data Sets
      (1) Training Data
          Sample size of training data = 1228.
          Maximum number of tokens in claim text = 67.
          Minimum number of tokens in claim text = 4.
          Average number of tokens in claim text = 20.1.
          Maximum number of evidence for training data = 5.
          Minimum number of evidence for training data = 1.
          Average number of evidence for training data = 3.36.
      (2) Developing Data
          Sample size of developing data = 154.
          Maximum number of tokens in claim text = 65.
          Minimum number of tokens in claim text = 4.
          Average number of tokens in claim text = 21.08.
          Maximum number of evidence for developing data = 5.
          Minimum number of evidence for developing data = 1.
          Average number of eviden

In [19]:
import pandas as pd
from pyecharts.charts import Bar
from pyecharts import options as opts

def plot_histogram(series, step=1, title="Histogram", x_label="Value", y_label="Frequency",
                   min_value=0, max_value=100):
    """Render a bar-chart histogram of ``series`` in the notebook.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to bin; missing values are dropped first.
    step : int, default 1
        Width of each bin. Must be a positive integer.
    title, x_label, y_label : str
        Chart title and axis names.
    min_value, max_value : int, default 0 and 100
        Bin edges run from ``min_value`` up to at least ``max_value`` in
        increments of ``step``. Bins are closed on the left, so values below
        ``min_value`` or at/above the last edge fall outside every bin and
        are not counted.

    Returns
    -------
    The object returned by pyecharts' ``render_notebook()``.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1.
    """
    # The original body reassigned `step = 1` here, silently discarding the
    # caller's argument; the parameter is now honoured.
    if step < 1:
        raise ValueError("step must be a positive integer")

    data = series.dropna()

    bins = list(range(min_value, max_value + step, step))
    categories = pd.cut(data, bins=bins, right=False)
    # sort=False keeps the bins in ascending edge order rather than by count.
    hist = categories.value_counts(sort=False)

    c = (
        Bar()
        .add_xaxis([f"{interval.left}" for interval in hist.index])
        .add_yaxis(series_name="Frequency", y_axis=hist.tolist())
        .set_global_opts(
            title_opts=opts.TitleOpts(title=title),
            xaxis_opts=opts.AxisOpts(name=x_label),
            yaxis_opts=opts.AxisOpts(name=y_label),
        )
    )

    # Display the chart in Jupyter
    return c.render_notebook()

#### 3.2 Plots

Evidence Length (histogram covers texts shorter than 100 tokens; longer texts are not plotted)

In [20]:
plot_histogram(count_text_length(data_evdn, 'evidence'), title="Evidence Length")

Claim Length: Training + Validation + Testing (histogram covers claims shorter than 100 tokens; longer claims are not plotted)

In [21]:
plot_histogram(count_text_length(pd.concat([data_tran, data_vald, data_test]), 'claim_text'), title="Claim Length")

### 4. Save Data

In [22]:
# Write every processed frame as JSON Lines (one record per line).
for frame, json_path in (
    (data_tran, '/root/COMP90042/data/data2/data_tran.json'),
    (data_vald, '/root/COMP90042/data/data2/data_vald.json'),
    (data_test, '/root/COMP90042/data/data2/data_test.json'),
    (data_evdn, '/root/COMP90042/data/data2/data_evdn.json'),
):
    frame.to_json(json_path, orient='records', lines=True)

In [23]:
data_tran

Unnamed: 0,claim_id,claim_text,claim_label,evidences,evidence_positive,evidence_negative
0,1937,Not only is there no scientific evidence that ...,DISPUTED,"[442946, 1194317, 12171]",{442946: ['At very high concentrations (100 ti...,{253996: ['Chereuta anthracistis is a moth in ...
1,126,El Niño drove record highs in global temperatu...,REFUTES,"[338219, 1127398]",{338219: ['While ‘climate change’ can be due t...,{163465: ['His song ``Perfect'' (featuring JR ...
2,2510,"In 1946, PDO switched to a cool phase.",SUPPORTS,"[530063, 984887]",{530063: ['There is evidence of reversals in t...,{215844: ['This iguana feeds almost exclusivel...
3,2021,Weather Channel co-founder John Coleman provid...,DISPUTED,"[1177431, 782448, 540069, 352655, 1007867]",{1177431: ['There is no convincing scientific ...,"{807793: ['(1545 -- 1624), Japanese samurai'],..."
4,2449,"""January 2008 capped a 12 month period of glob...",NOT_ENOUGH_INFO,"[1010750, 91661, 722725, 554161, 430839]",{1010750: ['With average temperature +8.1 °C (...,{37443: ['Nastradamus is the fourth studio alb...
...,...,...,...,...,...,...
1223,1504,Climate scientists say that aspects of the cas...,SUPPORTS,"[1055682, 1047356, 759337, 879500]","{1055682: ['""It's a fact: climate change made ...",{137625: ['He has managed the Hershey Wildcats...
1224,243,"In its 5th assessment report in 2013, the IPCC...",SUPPORTS,[916755],{916755: ['The scientific consensus as of 2013...,{995890: ['Although Best made debut against We...
1225,2302,"Since the mid 1970s, global temperatures have ...",NOT_ENOUGH_INFO,"[403673, 889933, 1120350, 195369, 37517]","{403673: ['Global Warming of 1.5 °C.'], 889933...",{1097427: ['Cotter was the son of Sir James La...
1226,502,But abnormal temperature spikes in February an...,NOT_ENOUGH_INFO,"[97375, 562427, 521257, 117389, 583187]",{97375: ['A lower air temperature of −94.7 °C ...,{368010: ['Guwahati's branch of Vivekananda Ke...
