In [1]:
import os

In [2]:
# Seed value reused by the `random` / NumPy seeding calls and by the train/pool split.
random_seed = 42

In [3]:
import csv

In [4]:
import random
import numpy as np
import pandas as pd

# Seed both global RNGs up front so later draws from `random` / `np.random`
# are repeatable across kernel restarts.
random.seed(random_seed)
np.random.seed(random_seed)

# Use the fully qualified option name: pandas resolves abbreviated names by
# pattern matching, which may stop working if another option with a similar
# name is introduced.
pd.set_option('display.max_colwidth', 256)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.metrics import classification_report

In [7]:
def read_data(filename, random_state=42):
    """Load a JSON-lines NLI file as a class-balanced binary dataset.

    Every line of ``filename`` must be a JSON object containing at least the
    keys ``gold_label``, ``sentence1`` and ``sentence2``.  Labels are collapsed
    into two classes: ``'entailment'`` -> 1, ``'contradiction'`` and
    ``'neutral'`` -> 0.  Rows whose label is ``'-'`` or any other value, and
    rows with a missing label or sentence, are discarded.  The two classes are
    then balanced by randomly down-sampling to the size of the smaller class.

    Parameters
    ----------
    filename : str or path-like
        Path to the JSON-lines file.
    random_state : int, default 42
        Seed used for the down-sampling, so repeated calls return the same rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``label`` (int64), ``sentence1``, ``sentence2``.  The original
        row labels are kept as the index; label-1 rows come first, followed by
        the sampled label-0 rows.
    """
    label_map = {'contradiction': 0, 'neutral': 0, 'entailment': 1}

    df = pd.read_json(filename, lines=True)
    df = df[['gold_label', 'sentence1', 'sentence2']].rename(columns={'gold_label': 'label'})

    # dropna() returns a new frame; the result has to be kept.  It is applied
    # after the column selection so that gaps in unrelated columns of the file
    # do not cause rows to be thrown away.
    df = df.dropna()

    # Labels missing from label_map (including '-') become NaN and are dropped.
    df = df.assign(label=df['label'].map(label_map)).dropna(subset=['label'])
    df = df.astype({'label': 'int64'})

    # Turn the task into balanced binary classification.
    df_entail = df[df['label'] == 1]
    df_non_entail = df[df['label'] == 0]
    n_per_class = min(len(df_entail), len(df_non_entail))
    if len(df_entail) > n_per_class:
        # Only reached when the non-entailment class is the smaller one;
        # otherwise every entailment row is kept in its original order.
        df_entail = df_entail.sample(n_per_class, random_state=random_state)
    df_non_entail = df_non_entail.sample(n_per_class, random_state=random_state)

    # DataFrame.append was removed in pandas 2.0; concat is the supported form.
    return pd.concat([df_entail, df_non_entail])

In [8]:
# Directory holding the SNLI 1.0 JSON-lines files. The trailing slash matters:
# file names are appended to this string by plain concatenation.
data_path = "data/SNLI/snli_1.0/"

In [9]:
# Load the training split through read_data and summarise the resulting frame.
train_file = "snli_1.0_{}.jsonl".format('train')
train = read_data(data_path + train_file)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 366832 entries, 2 to 412095
Data columns (total 3 columns):
label        366832 non-null int64
sentence1    366832 non-null object
sentence2    366832 non-null object
dtypes: int64(1), object(2)
memory usage: 11.2+ MB


In [10]:
# Show how many rows each label has in the loaded training data.
train['label'].value_counts()

1    183416
0    183416
Name: label, dtype: int64

In [11]:
# Copy the full loaded training frame into `pool`; the name `train` is re-bound
# to a smaller subset by the train_test_split call that follows.
pool = train.copy()

In [12]:
# Carve 10,000 rows out of `pool` (stratified on the label) to become the new
# `train`; the rest stays in `pool`. Both names are re-bound here, so running
# this cell a second time splits the already-reduced pool again.
pool_labels = pool['label']
pool, train = train_test_split(
    pool,
    test_size=10000,
    shuffle=True,
    random_state=random_seed,
    stratify=pool_labels,
)

In [13]:
# Summarise `train`, which now refers to the 10,000-row subset produced by the split.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 486168 to 262079
Data columns (total 3 columns):
label        10000 non-null int64
sentence1    10000 non-null object
sentence2    10000 non-null object
dtypes: int64(1), object(2)
memory usage: 312.5+ KB


In [14]:
# Rows where sentence1 contains "guy" and sentence2 contains "male".
# str.contains matches anywhere in the text, so "female" also counts as "male".
s1_has_guy = train['sentence1'].str.contains("guy")
s2_has_male = train['sentence2'].str.contains("male")
train[s1_has_guy & s2_has_male]

Unnamed: 0,label,sentence1,sentence2
72346,0,Guy and girl in jackets while guy listens to mobile device,A man and a woman are outside while the male listens to a podcast.
71431,1,A guy playing a game at a fair.,A male is at the fair


In [15]:
# Reverse direction: sentence2 contains "guy" while sentence1 contains "male".
# str.contains matches anywhere in the text, so "female" also counts as "male".
s2_has_guy = train['sentence2'].str.contains("guy")
s1_has_male = train['sentence1'].str.contains("male")
train[s2_has_guy & s1_has_male]

Unnamed: 0,label,sentence1,sentence2
359615,1,A black-haired male is playing a video system on his computer while wearing a watch.,a guy plays video games


In [16]:
# Rows whose sentence1 contains "iPod" (case-sensitive match).
s1_has_ipod = train['sentence1'].str.contains("iPod")
train[s1_has_ipod]

Unnamed: 0,label,sentence1,sentence2
29479,0,A black man holding a basketball while listening to music on his iPod.,A white man holding a basketball while listening to music on his iPod.
289352,0,A man sits on a folding chair outside while listening to music on his iPod.,The man stands on the sidewalk listening to the marching band that is passing by.
68801,1,A slender young caucasian female walks while listening to music through her headphones and iPod during a bright sunny day.,There is a female walking while listening to music on her headphones.
289350,1,A man sits on a folding chair outside while listening to music on his iPod.,There is a man on a chair listening to music on an mp3 player.


In [38]:
# Rows where sentence1 contains "iPod" and sentence2 contains "mp3".
s1_has_ipod = train['sentence1'].str.contains("iPod")
s2_has_mp3 = train['sentence2'].str.contains("mp3")
train[s1_has_ipod & s2_has_mp3]

Unnamed: 0,label,sentence1,sentence2
289350,1,A man sits on a folding chair outside while listening to music on his iPod.,There is a man on a chair listening to music on an mp3 player.


In [39]:
# Reverse direction: sentence2 contains "iPod" while sentence1 contains "mp3".
s2_has_ipod = train['sentence2'].str.contains("iPod")
s1_has_mp3 = train['sentence1'].str.contains("mp3")
train[s2_has_ipod & s1_has_mp3]

Unnamed: 0,label,sentence1,sentence2


In [14]:
# Load the dev split through read_data and summarise the resulting frame.
dev_file = "snli_1.0_{}.jsonl".format('dev')
dev = read_data(data_path + dev_file)
dev.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6658 entries, 1 to 5352
Data columns (total 3 columns):
label        6658 non-null int64
sentence1    6658 non-null object
sentence2    6658 non-null object
dtypes: int64(1), object(2)
memory usage: 208.1+ KB


In [15]:
# Load the test split through read_data and summarise the resulting frame.
test_file = "snli_1.0_{}.jsonl".format('test')
test = read_data(data_path + test_file)
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6736 entries, 1 to 6141
Data columns (total 3 columns):
label        6736 non-null int64
sentence1    6736 non-null object
sentence2    6736 non-null object
dtypes: int64(1), object(2)
memory usage: 210.5+ KB


In [16]:
# Tag each training row with its 0-based position in the current row order,
# giving an id that does not depend on the original row labels.
train_positions = list(range(len(train)))
train['sample_index'] = train_positions
train.head()

Unnamed: 0,label,sentence1,sentence2,sample_index
486168,1,A military branch prepare for their duties.,a child was there,0
340138,1,A person in makeup in a traditional ceremony.,A person takes part in a traditional ritual,1
275007,0,A man stands outside a blue building holding a white helmet.,A man is laying in a hammock that is strung between trees in his backyard.,2
458835,1,The girl in pink does a midair leg splits.,The girl in pink is flexible.,3
479397,1,A woman with orange hair sitting behind a table with Tye-dyed t-shirts on them.,A woman with orange hair.,4


In [17]:
# Show the last rows of `train`, including the sample_index column.
train.tail()

Unnamed: 0,label,sentence1,sentence2,sample_index
139507,1,A young man who loves nature.,A person loves nature.,9995
213909,1,Two people with dark jackets are sitting beside each other.,Two people are sitting next to each other.,9996
290513,1,A happy couple sitting together.,A couple is sitting together smiling.,9997
44491,1,A young man in a black hoodie is sitting on a curb.,A young person in a black hoodie is sitting on a curb.,9998
262079,0,A little boy lying down looking under a table.,a little boy stands on the table.,9999


In [18]:
# Show how many rows each label has in the 10,000-row training subset.
train['label'].value_counts()

1    5000
0    5000
Name: label, dtype: int64

In [18]:
# Tag each dev row with its 0-based position in the current row order.
dev_positions = list(range(len(dev)))
dev['sample_index'] = dev_positions
dev.head()

Unnamed: 0,label,sentence1,sentence2,sample_index
1,1,Two women are embracing while holding to go packages.,Two woman are holding packages.,0
3,1,"Two young children in blue jerseys, one with the number 9 and one with the number 2 are standing on wooden steps in a bathroom and washing their hands in a sink.",Two kids in numbered jerseys wash their hands.,1
8,1,A man selling donuts to a customer during a world exhibition event held in the city of Angeles,A man selling donuts to a customer.,2
9,1,"Two young boys of opposing teams play football, while wearing full protection uniforms and helmets.",boys play football,3
13,1,A man in a blue shirt standing in front of a garage-like structure painted with geometric designs.,A man is wearing a blue shirt,4


In [19]:
# Show how many rows each label has in the dev split.
dev['label'].value_counts()

1    3329
0    3329
Name: label, dtype: int64

In [20]:
# Tag each test row with its 0-based position in the current row order.
test_positions = list(range(len(test)))
test['sample_index'] = test_positions
test.head()

Unnamed: 0,label,sentence1,sentence2,sample_index
1,1,This church choir sings to the masses as they sing joyous songs from the book at a church.,The church is filled with song.,0
4,1,"A woman with a green headscarf, blue shirt and a very big grin.",The woman is very happy.,1
6,1,An old man with a package poses in front of an advertisement.,A man poses in front of an ad.,2
10,1,A statue at a museum that no seems to be looking at.,There is a statue that not many people seem to be interested in.,3
12,1,A land rover is being driven across a river.,A Land Rover is splashing water as it crosses a river.,4


In [21]:
# Show how many rows each label has in the test split.
test['label'].value_counts()

1    3368
0    3368
Name: label, dtype: int64

In [22]:
# Write the three splits to CSV without the pandas row index
# (sample_index is the row id carried in the files).
split_exports = [
    (train, 'data/train.csv'),
    (dev, 'data/dev.csv'),
    (test, 'data/test.csv'),
]
for split_frame, csv_path in split_exports:
    split_frame.to_csv(csv_path, index=False)

In [23]:
# Build progressively smaller training sets: for each percentage p in
# 0, 10, ..., 90, remove a random p% of `train` (sampled with seed 0) and
# save the remaining rows to data/random_0/<p>.csv.
sample_seed = 0
out_dir = "data/random_0"
os.makedirs(out_dir, exist_ok=True)  # loop-invariant, so create it once

total = len(train)
for percentage in range(0, 100, 10):
    # Integer arithmetic: int(total * (percentage / 100)) can come out one row
    # short when the float product lands just below a whole number.
    k = total * percentage // 100
    print(percentage, k)

    removed = train.sample(k, random_state=sample_seed)
    # drop() removes rows by index label, so this assumes the labels in
    # train.index are unique -- TODO confirm if the loading code changes.
    remaining = train.drop(removed.index)
    print(remaining['label'].value_counts())

    filename = "{}/{}.csv".format(out_dir, percentage)
    remaining[['label', 'sentence1', 'sentence2', 'sample_index']].to_csv(filename, index=False)

0 0
1    5000
0    5000
Name: label, dtype: int64
10 1000
0    4513
1    4487
Name: label, dtype: int64
20 2000
0    4007
1    3993
Name: label, dtype: int64
30 3000
0    3508
1    3492
Name: label, dtype: int64
40 4000
1    3004
0    2996
Name: label, dtype: int64
50 5000
1    2530
0    2470
Name: label, dtype: int64
60 6000
1    2046
0    1954
Name: label, dtype: int64
70 7000
1    1526
0    1474
Name: label, dtype: int64
80 8000
1    1004
0     996
Name: label, dtype: int64
90 9000
1    509
0    491
Name: label, dtype: int64


In [24]:
# Build progressively smaller training sets: for each percentage p in
# 0, 10, ..., 90, remove a random p% of `train` (sampled with seed 2) and
# save the remaining rows to data/random_2/<p>.csv.
sample_seed = 2
out_dir = "data/random_2"
os.makedirs(out_dir, exist_ok=True)  # loop-invariant, so create it once

total = len(train)
for percentage in range(0, 100, 10):
    # Integer arithmetic: int(total * (percentage / 100)) can come out one row
    # short when the float product lands just below a whole number.
    k = total * percentage // 100
    print(percentage, k)

    removed = train.sample(k, random_state=sample_seed)
    # drop() removes rows by index label, so this assumes the labels in
    # train.index are unique -- TODO confirm if the loading code changes.
    remaining = train.drop(removed.index)
    print(remaining['label'].value_counts())

    filename = "{}/{}.csv".format(out_dir, percentage)
    remaining[['label', 'sentence1', 'sentence2', 'sample_index']].to_csv(filename, index=False)

0 0
1    5000
0    5000
Name: label, dtype: int64
10 1000
1    4517
0    4483
Name: label, dtype: int64
20 2000
1    4030
0    3970
Name: label, dtype: int64
30 3000
1    3529
0    3471
Name: label, dtype: int64
40 4000
1    3014
0    2986
Name: label, dtype: int64
50 5000
0    2501
1    2499
Name: label, dtype: int64
60 6000
1    2014
0    1986
Name: label, dtype: int64
70 7000
1    1520
0    1480
Name: label, dtype: int64
80 8000
1    1026
0     974
Name: label, dtype: int64
90 9000
0    503
1    497
Name: label, dtype: int64


In [25]:
# Build progressively smaller training sets: for each percentage p in
# 0, 10, ..., 90, remove a random p% of `train` (sampled with seed 42) and
# save the remaining rows to data/random/<p>.csv.
sample_seed = 42
out_dir = "data/random"
os.makedirs(out_dir, exist_ok=True)  # loop-invariant, so create it once

total = len(train)
for percentage in range(0, 100, 10):
    # Integer arithmetic: int(total * (percentage / 100)) can come out one row
    # short when the float product lands just below a whole number.
    k = total * percentage // 100
    print(percentage, k)

    removed = train.sample(k, random_state=sample_seed)
    # drop() removes rows by index label, so this assumes the labels in
    # train.index are unique -- TODO confirm if the loading code changes.
    remaining = train.drop(removed.index)
    print(remaining['label'].value_counts())

    filename = "{}/{}.csv".format(out_dir, percentage)
    remaining[['label', 'sentence1', 'sentence2', 'sample_index']].to_csv(filename, index=False)

0 0
1    5000
0    5000
Name: label, dtype: int64
10 1000
0    4503
1    4497
Name: label, dtype: int64
20 2000
1    4009
0    3991
Name: label, dtype: int64
30 3000
1    3521
0    3479
Name: label, dtype: int64
40 4000
1    3032
0    2968
Name: label, dtype: int64
50 5000
1    2541
0    2459
Name: label, dtype: int64
60 6000
1    2017
0    1983
Name: label, dtype: int64
70 7000
1    1501
0    1499
Name: label, dtype: int64
80 8000
0    1003
1     997
Name: label, dtype: int64
90 9000
0    519
1    481
Name: label, dtype: int64
