# EDA

In [1]:
!pip install torch
!pip install transformers
!pip install numpy
!pip install pandas
!pip install sentence-transformers



In [2]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch import nn
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import random

In [3]:
# Paths are resolved relative to the directory the kernel was started in.
BASE_PATH = os.getcwd()
DATA_PATH = "/".join((BASE_PATH, "data"))

In [4]:
def seed_everything(seed=1):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    every CUDA device), and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Hash randomisation of the running interpreter is fixed at startup, so
    # this only takes effect in subprocesses launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different kernels from run to run; turn it off so
    # the deterministic flag above is actually effective.
    torch.backends.cudnn.benchmark = False

# Notebook-wide constants, followed by the one-off seeding call.
# NOTE(review): MAX_LENGTH is not used in the cells shown here — presumably a
# tokenizer length cap; confirm against later cells.
SEED, MAX_LENGTH = 1, 256
seed_everything(SEED)

In [5]:
# Load the training split, using the `id` column as the index.
# NOTE(review): the saved outputs below show a RangeIndex with `id` as an
# ordinary column, which does not match index_col="id" — the outputs look
# stale; re-run the notebook to confirm.
df = pd.read_csv("/".join((DATA_PATH, "train.csv")), index_col="id")
df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2829,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2830,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2831,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2832,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


In [6]:
# Column dtypes, non-null counts and memory footprint of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2834 entries, 0 to 2833
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              2834 non-null   object 
 1   url_legal       830 non-null    object 
 2   license         830 non-null    object 
 3   excerpt         2834 non-null   object 
 4   target          2834 non-null   float64
 5   standard_error  2834 non-null   float64
dtypes: float64(2), object(4)
memory usage: 133.0+ KB


In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) of the target column.
df["target"].describe()

count    2834.000000
mean       -0.959319
std         1.033579
min        -3.676268
25%        -1.690320
50%        -0.912190
75%        -0.202540
max         1.711390
Name: target, dtype: float64

In [19]:
# Excerpt(s) whose target equals the column minimum.
df.loc[df["target"].eq(df["target"].min()), "excerpt"]

1705    The commutator is peculiar, consisting of only...
Name: excerpt, dtype: object

In [20]:
# Excerpt(s) whose target equals the column maximum.
df.loc[df["target"].eq(df["target"].max()), "excerpt"]

2829    When you think of dinosaurs and where they liv...
Name: excerpt, dtype: object

A higher target value corresponds to easier readability, so our goal should be to find a model that can quickly separate the top quartile of excerpts (the easily readable ones) from everything else, which would then be looked at for revision.

If the current dataset proves insufficient for training, we should explore augmenting it by generating sentences similar to those already in the dataset. We should be able to do this with sentence embeddings or a similar technique.