***Sampling***

In [None]:
import json
import os
import tqdm

def sample_json(input_filename, output_filename, target_size_gb=15, filter_key='also_buy'):
    """Copy filtered records from a JSON Lines file until a size budget is reached.

    The input is read line by line, each line is parsed as JSON, and every
    record that passes the filter is written to the output as one JSON
    document per line. Sampling stops as soon as the amount of text written
    reaches ``target_size_gb``.

    Args:
        input_filename: Path of the UTF-8 input file, one JSON document per line.
        output_filename: Path of the file to write (overwritten if it exists).
        target_size_gb: Output size budget in units of 1024 ** 3 bytes.
        filter_key: A record is kept only when it contains this key with a
            truthy value. ``None`` disables filtering and keeps every record.
    """
    bytes_per_gb = 1024 ** 3
    target_size_bytes = target_size_gb * bytes_per_gb
    total_bytes = 0
    extracted_gb = 0  # Whole gigabytes written so far

    with open(input_filename, 'r', encoding='utf-8') as input_file:
        with open(output_filename, 'w', encoding='utf-8') as output_file:
            for line in tqdm.tqdm(input_file):
                # Stop sampling once the size budget has been used up
                if total_bytes >= target_size_bytes:
                    break

                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    # Skip blank or malformed lines instead of aborting a long run
                    continue

                # Check if the record satisfies the filter condition
                if filter_key is not None and not (filter_key in record and record[filter_key]):
                    continue

                serialized = json.dumps(record)
                output_file.write(serialized)
                output_file.write('\n')

                # Count what was actually written (+1 for the newline)
                total_bytes += len(serialized.encode('utf-8')) + 1

                # Report progress only when another whole gigabyte is completed
                completed_gb = total_bytes // bytes_per_gb
                if completed_gb > extracted_gb:
                    extracted_gb = completed_gb
                    print(f"Currently extracting gb#{extracted_gb}", end='\r')

    # Display total GB written after completion
    print(f"Total GB processed: {total_bytes / bytes_per_gb:.2f} GB")

# Example usage (raw string so the Windows backslashes are never read as escape sequences):
sample_json(r'E:\All_Amazon_Meta.json\All_Amazon_Meta.json', 'sampled_original_amazon.json')


***Uploading the JSON file***

In [None]:
import string
import json

def get_total_records(file_path):
    """Return the number of lines in the text file at ``file_path``.

    The file is decoded as UTF-8 explicitly so the count does not depend on
    the platform's default encoding.

    Args:
        file_path: Path of the file whose lines are counted.

    Returns:
        The line count as an int; 0 for an empty file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return sum(1 for _ in file)


***Pre-processing (I)***

In [None]:

def word_tokenize(text):
    """Split ``text`` into a list of tokens on runs of whitespace."""
    tokens = text.split()
    return tokens

def stem_word(word, prefix_length=4):
    """Return a crude stem of ``word``: its first ``prefix_length`` characters.

    Args:
        word: The token to shorten.
        prefix_length: Number of leading characters to keep (4 by default).

    Returns:
        The prefix of ``word``; words no longer than ``prefix_length`` are
        returned unchanged.
    """
    return word[:prefix_length]

def read_stopwords_from_file(filename):
    """Read a UTF-8 stopword file and return its lines as a list.

    Every line of the file contributes one entry, with leading and trailing
    whitespace stripped.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        return list(map(str.strip, handle))

def remove_stopwords(tokens, stopwords):
    """Return the tokens that are not stopwords, preserving their order.

    Args:
        tokens: Iterable of string tokens.
        stopwords: Collection of words to drop. A list is converted to a set
            once per call so each membership test is a hash lookup instead
            of a linear scan; pass a set to avoid the conversion.

    Returns:
        A new list with the remaining tokens.
    """
    if isinstance(stopwords, (set, frozenset)):
        excluded = stopwords
    else:
        excluded = set(stopwords)
    return [token for token in tokens if token not in excluded]

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    all other characters are kept as they are.
    """
    # A deletion table lets str.translate drop the characters in a single pass
    return text.translate(str.maketrans('', '', string.punctuation))


***Pre-processing (II)***

In [None]:

def preprocess_text(text, stopwords):
    """Normalise ``text`` and return it as one space-separated string.

    Steps, in order: lowercase, strip punctuation, tokenize, drop the
    tokens found in ``stopwords``, then stem each remaining token.
    """
    cleaned = remove_punctuation(text.lower())
    kept_tokens = remove_stopwords(word_tokenize(cleaned), stopwords)
    return ' '.join(stem_word(token) for token in kept_tokens)

***Chunk Processing***

In [None]:

def process_data_in_chunks(input_file, output_file, total_records, records_per_chunk, stopwords):
    """Filter and preprocess JSON Lines records, one chunk at a time.

    Up to ``records_per_chunk`` lines are read per chunk. Each line is parsed
    as JSON, reduced to the keys listed in the module-level
    ``columns_to_keep``, and every string value is passed through
    ``preprocess_text``. Each chunk is written as one JSON array followed by
    a newline, so the output can be parsed line by line however many chunks
    are produced.

    Args:
        input_file: Open text file to read from, one JSON document per line.
        output_file: Open text file that receives the processed chunks.
        total_records: Total number of input lines; used only for the
            progress percentage.
        records_per_chunk: Maximum number of lines read per chunk.
        stopwords: Collection of stopwords handed to ``preprocess_text``.
    """
    line_index = 0
    chunk_count = 0
    while True:
        chunk = []
        for _ in range(records_per_chunk):
            line = input_file.readline()
            if not line:
                break
            chunk.append(line)
        if not chunk:
            break
        line_index += len(chunk)
        chunk_count += 1
        filtered_chunk = []
        for line in chunk:
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # Skip invalid JSON lines
                continue
            if not isinstance(data, dict):
                # Only JSON objects have columns to select from
                continue
            filtered_data = {key: data[key] for key in columns_to_keep if key in data}
            # Preprocess text data
            # NOTE(review): every string value is preprocessed, which also lowercases
            # and truncates ID-like columns such as 'asin' - confirm this is intended.
            for key, value in filtered_data.items():
                if isinstance(value, str):
                    filtered_data[key] = preprocess_text(value, stopwords)
            filtered_chunk.append(filtered_data)
        json.dump(filtered_chunk, output_file)
        # Separate chunks so consecutive arrays do not run together as "][".
        output_file.write('\n')
        # Guard the percentage against a zero total supplied by the caller
        percent_done = (line_index / total_records) * 100 if total_records else 100.0
        print(f"Chunk {chunk_count}: {line_index} records filtered. ({percent_done:.2f}% done)")


***Function Calling***

In [None]:

# Specify the file paths (raw string so the Windows backslashes are never read as escapes)
input_json_path = r"E:\sampled_original_amazon\sampled_original_amazon.json"
output_json_path = "filtered_data.json"
stopwords_file = "C:\\Users\\admin\\Documents\\english.txt"

# Define the columns to keep
columns_to_keep = ['asin', 'title','price', 'also_buy', 'also_view']

# Print statement for loading product data
print("Filtering product data...")

# Read stopwords from the file
stopwords = read_stopwords_from_file(stopwords_file)

# Get total records in the JSON file
total_records = get_total_records(input_json_path)

# Calculate records per chunk based on total records
records_per_chunk = min(total_records, 100000)  # Set a maximum of 100000 records per chunk

# Read from input JSON file in chunks, filter, preprocess, and write to output JSON file.
# The encoding is explicit so the result does not depend on the platform default.
with open(input_json_path, 'r', encoding='utf-8') as input_file, \
        open(output_json_path, 'w', encoding='utf-8') as output_file:
    process_data_in_chunks(input_file, output_file, total_records, records_per_chunk, stopwords)

# Print statement indicating completion (reports the path actually written)
print(f"Filtered and preprocessed data saved to '{output_json_path}'")
