In [None]:
 # To install all required packages, run this cell (can be left out otherwise)
!pip install pandas==1.4.0 zstandard gensim sparknlp pyspark

In [1]:
import pandas as pd
import gensim
import json
import gc
import tempfile

from wefe.metrics import RNSB, WEAT
from wefe.query import Query
from wefe.word_embedding_model import WordEmbeddingModel


from os import path
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import col, explode, lit, sentences
from datasets import load_dataset
from gensim.utils import simple_preprocess

## Parsing input query

In [2]:
def parse_input_json(file_path):
    """Parse a JSON input file and return its contents.

    Keyword arguments:
    file_path -- path to file containing the input parameters

    Return:
    data -- parsed data from the input file

    Raises:
    OSError -- if the file cannot be opened
    json.JSONDecodeError -- if the file does not contain valid JSON
    """
    print(file_path)

    # The context manager closes the file even when json.load raises.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    return data

def fetch_queries(input_queries):
    """Build the list of query dictionaries described by the input file.

    Each entry of input_queries is handled in one of two ways:

    * If it has a 'queries_path_name' key, that JSON file is opened and the
      word sets are looked up in it. 'target_sets_names' must then hold
      [group, first_set, second_set] and 'attribute_sets_names' must hold
      two attribute set names.
    * Otherwise the entry's own 'target_sets', 'attribute_sets',
      'target_sets_names' and 'attribute_sets_names' are used as given.

    Keyword arguments:
    input_queries -- iterable of query specifications (dicts)

    Return:
    query_sets -- list of dicts with the keys 'target_sets',
                  'attribute_sets', 'target_sets_names' and
                  'attribute_sets_names'
    """
    query_sets = []
    for input_query in input_queries:
        query = {}
        if 'queries_path_name' in input_query:
            # When path to a queries file is given.
            # The context manager closes the file even if parsing fails.
            with open(input_query['queries_path_name'], encoding="utf-8") as query_file:
                raw_query_data = json.load(query_file)

            target_names = input_query['target_sets_names']
            attribute_names = input_query['attribute_sets_names']

            # target_names[0] selects the group, [1] and [2] the two sets inside it
            target_group = raw_query_data['target_sets'][target_names[0]]
            raw_attributes = raw_query_data['attribute_sets']

            query['target_sets'] = [target_group[target_names[i]]['set'] for i in (1, 2)]
            query['attribute_sets'] = [raw_attributes[attribute_names[i]]['set'] for i in (0, 1)]
            query['target_sets_names'] = [target_names[1], target_names[2]]
            query['attribute_sets_names'] = attribute_names
        else:
            # when query target and attribute sets are explicitly given
            query['target_sets'] = input_query['target_sets']
            query['attribute_sets'] = input_query['attribute_sets']
            query['target_sets_names'] = input_query['target_sets_names']
            query['attribute_sets_names'] = input_query['attribute_sets_names']

        query_sets.append(query)

    return query_sets

## Loading the data

In [3]:
 # Define data set paths
# Root of the local copy of The Pile, written with a file:/// prefix.
THE_PILE_BASE_PATH = path.join(
    "file:///",
    "mnt", "ceph", "storage",
    "corpora", "corpora-thirdparty", "the-pile",
)

# Validation and test splits are single compressed JSON-lines files.
val_data_path = path.join(THE_PILE_BASE_PATH, "val.jsonl.zst")
test_data_path = path.join(THE_PILE_BASE_PATH, "test.jsonl.zst")

# The training split is sharded into 30 files: train/00.jsonl.zst ... train/29.jsonl.zst
train_data_paths = [
    path.join(THE_PILE_BASE_PATH, "train", "%02d.jsonl.zst" % shard)
    for shard in range(30)
]

print(train_data_paths)

['file:///mnt/ceph/storage/corpora/corpora-thirdparty/the-pile/train/00.jsonl.zst', 'file:///mnt/ceph/storage/corpora/corpora-thirdparty/the-pile/train/01.jsonl.zst', 'file:///mnt/ceph/storage/corpora/corpora-thirdparty/the-pile/train/02.jsonl.zst', 'file:///mnt/ceph/storage/corpora/corpora-thirdparty/the-pile/train/03.jsonl.zst', 'file:///mnt/ceph/storage/corpora/corpora-thirdparty/the-pile/train/04.jsonl.zst', 'file:///mnt/ceph/storage/corpora/corpora-thirdparty/the-pile/train/05.jsonl.zst', 'file:///mnt/ceph/storage/corpora/corpora-thirdparty/the-pile/train/06.jsonl.zst', 'file:///mnt/ceph/storage/corpora/corpora-thirdparty/the-pile/train/07.jsonl.zst', 'file:///mnt/ceph/storage/corpora/corpora-thirdparty/the-pile/train/08.jsonl.zst', 'file:///mnt/ceph/storage/corpora/corpora-thirdparty/the-pile/train/09.jsonl.zst', 'file:///mnt/ceph/storage/corpora/corpora-thirdparty/the-pile/train/10.jsonl.zst', 'file:///mnt/ceph/storage/corpora/corpora-thirdparty/the-pile/train/11.jsonl.zst', 'fi

In [4]:
# Data set selection: names of the Pile subsets (values of meta['pile_set_name'])
# that are kept for training.
# Every entry ends with a comma: without one, Python silently concatenates
# adjacent string literals into a single, non-matching name.
data_selection = [
    'OpenWebText2',
    'PubMed Abstracts',
    'StackExchange',
    # 'Github', # Currently ignoring because we don't want the code
    'Enron Emails',
    'FreeLaw',
    'USPTO Backgrounds',
    'Pile-CC',
    'Wikipedia (en)',
    'Books3',
    'PubMed Central',
    'HackerNews',
    'Gutenberg (PG-19)',
    # 'DM Mathematics', # Currently ignoring because we don't want math formulas
    'NIH ExPorter',
    'ArXiv',
    'BookCorpus2',
    'OpenSubtitles',
    'YoutubeSubtitles',
    'Ubuntu IRC',
    # 'EuroParl', # Currently ignoring because we'll focus on English text for now
    'PhilPapers',
]

In [5]:
# Shared state for the training cell
# False until a model has been saved; the training loop overwrites it with save_model's result
model_flag = False
# Location the training loop loads the model from
model_filepath = '../models/pile/gensim_word2vec_pile'

## Word embedding models

In [6]:
def word2vec_model(text, workers):
    """Create a gensim Word2Vec model and build its vocabulary from text.

    The returned model has a vocabulary but has not been trained.

    Keyword arguments:
    text -- dataset, set of tokenized sentences, passed to build_vocab
    workers -- forwarded to Word2Vec's workers argument

    Return:
    model -- the Word2Vec instance after build_vocab has run
    """
    w2v = gensim.models.Word2Vec(workers=workers)

    # Scan the corpus once to collect the vocabulary
    print("\nBuilding vocabulary")
    w2v.build_vocab(text, progress_per=10000)

    return w2v

## Metrics evaluation

In [7]:
def evaluate_metrics(model, query, model_name, lost_vocabulary_threshold=0.5, p_value_iterations=15000):
    """Run the WEAT and RNSB metrics for one query and print the scores.

    Keyword arguments:
    model -- trained model exposing its word vectors as model.wv
    query -- dict with the keys 'target_sets', 'attribute_sets',
             'target_sets_names' and 'attribute_sets_names'
    model_name -- name given to the wrapped embedding model; printed with the scores
    lost_vocabulary_threshold -- forwarded to both run_query calls (default 0.5)
    p_value_iterations -- forwarded to both run_query calls (default 15000)

    Return:
    None -- the scores are printed
    """
    # Build the query from the `query` argument rather than from a notebook global,
    # so the function evaluates whichever query the caller passes in.
    new_query = Query(target_sets=query['target_sets'],
                      attribute_sets=query['attribute_sets'],
                      target_sets_names=query['target_sets_names'],
                      attribute_sets_names=query['attribute_sets_names'])

    we_model = WordEmbeddingModel(model.wv, model_name)

    weat_res = WEAT().run_query(new_query, we_model,
                                lost_vocabulary_threshold=lost_vocabulary_threshold,
                                calculate_p_value=True,
                                p_value_iterations=p_value_iterations)

    print(we_model.name, weat_res['query_name'], 'WEAT', weat_res['weat'])

    rnsb_res = RNSB().run_query(new_query, we_model,
                                calculate_p_value=True,
                                lost_vocabulary_threshold=lost_vocabulary_threshold,
                                p_value_iterations=p_value_iterations)

    print(we_model.name, rnsb_res['query_name'], 'RNSB', rnsb_res['rnsb'])

## Saving models, loading models and freeing memory

In [8]:
def save_model(model, model_filepath="../models/pile/gensim_word2vec_pile"):
    """Save a model to disk and report success.

    Keyword arguments:
    model -- object exposing a save(path) method
    model_filepath -- destination path; defaults to the location this
                      function previously hard-coded

    Return:
    True -- returned once model.save has completed; any exception raised
            by model.save propagates to the caller
    """
    model.save(model_filepath)
    print("Model Saved")

    return True

def load_model(model_filepath):
    """Load a gensim Word2Vec model from model_filepath.

    Keyword arguments:
    model_filepath -- path (string) handed to Word2Vec.load

    Return:
    the object returned by gensim.models.Word2Vec.load
    """
    message = "Loading model from : " + model_filepath
    print(message)

    return gensim.models.Word2Vec.load(model_filepath)

def clear_variables(names=("data_chunk", "filtered_data", "df_text", "processed_text", "model"),
                    namespace=None):
    """Remove the per-chunk working variables and run the garbage collector.

    A `del name` statement inside a function only targets that function's
    own locals, so it cannot remove notebook-level variables. The names are
    removed from the namespace dictionary instead.

    Keyword arguments:
    names -- variable names to remove; names that are not present are skipped
    namespace -- dictionary to remove them from; defaults to the globals of
                 the module (or notebook) in which this function is defined

    Return:
    None
    """
    if namespace is None:
        namespace = globals()

    for name in names:
        namespace.pop(name, None)

    gc.collect()

## Training in batches

In [9]:
# Train word embeddings using Pile data in batches.
# Each training shard is read in chunks. The first non-empty chunk creates the
# model; every later chunk reloads the saved model, extends its vocabulary with
# the new chunk and continues training.

# TODO: add start time variable

# Parse input file
input_data = parse_input_json("../data/experiments/case1_new_query.json")

# Fetching queries from input file
query_data = fetch_queries(input_data['queries'])

# Iterating through training paths
for file_path in train_data_paths:
    print(file_path)
    # Read 'zstd' compressed file and iterate through the file in batches of given chunksize
    with pd.read_json(file_path, lines=True, chunksize=1000000, compression='zstd') as reader:
        for data_chunk in reader:
            # Transform set name column to make it easier to work with
            data_chunk["meta_str"] = data_chunk["meta"].apply(lambda x: x["pile_set_name"])
            print(len(data_chunk))

            # Only select the data we are interested in for now
            filtered_data = data_chunk[data_chunk["meta_str"].isin(data_selection)][["text", "meta_str"]]
            print(len(filtered_data))

            if filtered_data.empty:
                # Nothing to train on in this chunk; free it and move on
                del data_chunk, filtered_data
                gc.collect()
                continue

            # Convert filtered data to pandas dataframe for easier processing
            df_text = pd.DataFrame(filtered_data['text']).reset_index()

            # Apply simple preprocessing(tokenization, removal of punctuations and special characters) on text
            # TODO: measure how long this takes and whether apply can use multiple cores
            processed_text = df_text['text'].apply(simple_preprocess)

            if not model_flag:
                # Instantiate word2vec and build vocabulary
                model = word2vec_model(processed_text, workers=8)
            else:
                model = load_model(model_filepath)
                # Add this chunk's words to the existing vocabulary; without
                # this step training only sees words from the first chunk.
                model.build_vocab(processed_text, update=True)
            print(model)

            # Training model; total_examples is the number of documents in THIS chunk
            model.train(processed_text, total_examples=len(processed_text), epochs=model.epochs)

            # Saving model
            model_flag = save_model(model)

            # Clear variables and free memory
            del data_chunk, filtered_data, df_text, processed_text, model
            gc.collect()
            
# Evaluate metrics
# evaluate_metrics(word2vec, query_data[0], "word2vec")

../data/experiments/case1_new_query.json
<_io.TextIOWrapper name='../data/experiments/case1_new_query.json' mode='r' encoding='UTF-8'>


FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/queries.json'

### Ignore the following cells

In [None]:
print(train_data_paths[1])

# Count how many chunks the second training shard yields.
# The counter is not called `len`: rebinding that name would shadow the builtin
# and break every later len(...) call in the notebook.
with pd.read_json(train_data_paths[1], lines=True, chunksize=1000000, compression='zstd') as reader:
    chunk_count = 0
    for chunk in reader:
        chunk_count += 1
    print(chunk_count)

In [None]:
 # Read data
# Read the validation split; the statements below and the following cells use val_data
val_data = pd.read_json(val_data_path, lines=True, compression='zstd')

# With chunksize set, read_json returns a reader that yields DataFrames and has
# no length of its own, so only the first chunk of the first training shard is
# loaded here.
with pd.read_json(train_data_paths[0], lines=True, chunksize=234021, compression='zstd') as reader:
    train_data = next(reader)

print(len(train_data))
# Transform set name column to make it easier to work with
val_data["meta_str"] = val_data["meta"].apply(lambda x: x["pile_set_name"])

## Filtering and Preprocessing

In [None]:
 # Only select the data we are interested in for now
# Keep only rows whose subset name is in data_selection, and only the two columns needed
keep_rows = val_data["meta_str"].isin(data_selection)
filtered_data = val_data[keep_rows][["text", "meta_str"]]

# Raw text column of the filtered rows
text1 = filtered_data["text"]

# One-column frame with a fresh index (the old index is kept as a column)
df_text = pd.DataFrame(text1).reset_index()

# Tokenize each document with gensim's simple_preprocess
proc_text = df_text["text"].apply(simple_preprocess)

In [None]:
# Number of rows kept by the filter
print(filtered_data.shape[0])

## Train word embedding models

In [None]:
# Create the Word2Vec model and build its vocabulary from the preprocessed text
word2vec = word2vec_model(text=proc_text, workers=8)
print(word2vec)

# Train on the same corpus, using the example count and epoch count stored on the model
word2vec.train(
    proc_text,
    total_examples=word2vec.corpus_count,
    epochs=word2vec.epochs,
)

In [None]:
def fasttext_model(text, workers):
    """Instantiates a fasttext model and builds vocabulary for the model.

    Keyword arguments:
    text -- dataset, set of tokenized sentences
    workers -- no of workers used to train

    Return:
    model -- instance of a fasttext model with its vocabulary built (not yet trained)
    """
    # Instantiates gensim implementation of a Fasttext model
    model = gensim.models.FastText(workers=workers)

    print("\nBuilding vocabulary")
    # Build the vocabulary from the in-memory sentences passed by the caller
    model.build_vocab(text)

    return model
    

In [None]:
# Create the FastText model for the preprocessed text, with 8 workers
fasttext = fasttext_model(text=proc_text, workers=8)

## Parsing input query and metrics evaluation

In [None]:
def evaluate_metrics(model, query, model_name, lost_vocabulary_threshold=0.5, p_value_iterations=15000):
    """Run the WEAT and RNSB metrics for one query and print the scores.

    NOTE: this cell redefines the function of the same name from an earlier cell.

    Keyword arguments:
    model -- trained model exposing its word vectors as model.wv
    query -- dict with the keys 'target_sets', 'attribute_sets',
             'target_sets_names' and 'attribute_sets_names'
    model_name -- name given to the wrapped embedding model; printed with the scores
    lost_vocabulary_threshold -- forwarded to both run_query calls (default 0.5)
    p_value_iterations -- forwarded to both run_query calls (default 15000)

    Return:
    None -- the scores are printed
    """
    # Build the query from the `query` argument rather than from a notebook global,
    # so the function evaluates whichever query the caller passes in.
    new_query = Query(target_sets=query['target_sets'],
                      attribute_sets=query['attribute_sets'],
                      target_sets_names=query['target_sets_names'],
                      attribute_sets_names=query['attribute_sets_names'])

    we_model = WordEmbeddingModel(model.wv, model_name)

    weat_res = WEAT().run_query(new_query, we_model,
                                lost_vocabulary_threshold=lost_vocabulary_threshold,
                                calculate_p_value=True,
                                p_value_iterations=p_value_iterations)

    print(we_model.name, weat_res['query_name'], 'WEAT', weat_res['weat'])

    rnsb_res = RNSB().run_query(new_query, we_model,
                                calculate_p_value=True,
                                lost_vocabulary_threshold=lost_vocabulary_threshold,
                                p_value_iterations=p_value_iterations)

    print(we_model.name, rnsb_res['query_name'], 'RNSB', rnsb_res['rnsb'])
    

In [None]:
def parse_input_json(file_path):
    """Parse a JSON input file and return its contents.

    NOTE: this cell redefines the function of the same name from an earlier cell.

    Keyword arguments:
    file_path -- path to file containing the input parameters

    Return:
    data -- parsed data from the input file

    Raises:
    OSError -- if the file cannot be opened
    json.JSONDecodeError -- if the file does not contain valid JSON
    """
    print(file_path)

    # The context manager closes the file even when json.load raises.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    return data

def fetch_queries(input_queries):
    """Build the list of query dictionaries described by the input file.

    NOTE: this cell redefines the function of the same name from an earlier cell.

    Each entry of input_queries is handled in one of two ways:

    * If it has a 'queries_path_name' key, that JSON file is opened and the
      word sets are looked up in it. 'target_sets_names' must then hold
      [group, first_set, second_set] and 'attribute_sets_names' must hold
      two attribute set names.
    * Otherwise the entry's own 'target_sets', 'attribute_sets',
      'target_sets_names' and 'attribute_sets_names' are used as given.

    Keyword arguments:
    input_queries -- iterable of query specifications (dicts)

    Return:
    query_sets -- list of dicts with the keys 'target_sets',
                  'attribute_sets', 'target_sets_names' and
                  'attribute_sets_names'
    """
    query_sets = []
    for input_query in input_queries:
        query = {}
        if 'queries_path_name' in input_query:
            # When path to a queries file is given.
            # The context manager closes the file even if parsing fails.
            with open(input_query['queries_path_name'], encoding="utf-8") as query_file:
                raw_query_data = json.load(query_file)

            target_names = input_query['target_sets_names']
            attribute_names = input_query['attribute_sets_names']

            # target_names[0] selects the group, [1] and [2] the two sets inside it
            target_group = raw_query_data['target_sets'][target_names[0]]
            raw_attributes = raw_query_data['attribute_sets']

            query['target_sets'] = [target_group[target_names[i]]['set'] for i in (1, 2)]
            query['attribute_sets'] = [raw_attributes[attribute_names[i]]['set'] for i in (0, 1)]
            query['target_sets_names'] = [target_names[1], target_names[2]]
            query['attribute_sets_names'] = attribute_names
        else:
            # when query target and attribute sets are explicitly given
            query['target_sets'] = input_query['target_sets']
            query['attribute_sets'] = input_query['attribute_sets']
            query['target_sets_names'] = input_query['target_sets_names']
            query['attribute_sets_names'] = input_query['attribute_sets_names']

        query_sets.append(query)

    return query_sets


In [None]:
# Load the experiment description
input_data = parse_input_json("../data/experiments/case1_new_query.json")

# Resolve its query definitions into word sets
query_data = fetch_queries(input_data["queries"])

# Run WEAT and RNSB for the first query on the trained Word2Vec model
evaluate_metrics(model=word2vec, query=query_data[0], model_name="word2vec")
