In [None]:
import os
import json
import pandas as pd
import re

# Show the OS process ID of the process running this cell.
# NOTE(review): presumably printed so the kernel can be located from a shell
# (e.g. to monitor or stop a long run) — confirm intent.
print(f"Process ID: {os.getpid()}")

In [31]:
# Stage 1: pull the "output" field out of every raw generation record.
#
# Reads `input_file` one JSON document per line, writes the value stored under
# the "output" key (re-serialised as JSON, non-ASCII characters kept verbatim)
# to `output_file`, and logs every rejected line to `invalid_lines_file`.
#
# NOTE(review): rejected lines are dropped rather than replaced by a
# placeholder, so row N of `output_file` is not guaranteed to correspond to
# row N of `input_file`. A later cell pairs tweet rows with building IDs purely
# by position, so any line reported here should be investigated.

# Input and output file paths
input_file = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/generated tweets/tweets_better_6000 copy.jsonl"
output_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/generated_tweets_residential_filtered.jsonl"
invalid_lines_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/error_row.jsonl"


with open(input_file, "r", encoding="utf-8") as infile, \
     open(output_file, "w", encoding="utf-8") as outfile, \
     open(invalid_lines_file, "w", encoding="utf-8") as errorfile:

    for line_number, line in enumerate(infile, start=1):
        try:
            # Parse each line as JSON
            data = json.loads(line.strip())
        except json.JSONDecodeError as e:
            # Log invalid JSON lines (blank lines end up here as well)
            print(f"Skipping line {line_number} due to JSONDecodeError: {e}")
            errorfile.write(f"Line {line_number}: {line.strip()} | Error: {e}\n")
            continue

        # A line can be valid JSON without being an object (a bare list,
        # string or number). Only dicts have `.get`; anything else is treated
        # like a record without an "output" field instead of raising
        # AttributeError and aborting the run with half-written files.
        output = data.get("output", "") if isinstance(data, dict) else ""

        if output:
            # The value is written JSON-encoded, one value per line
            outfile.write(json.dumps(output, ensure_ascii=False) + "\n")
        else:
            # Log lines where `output` is missing or empty
            print(f"Line {line_number}: Missing 'output' field")
            errorfile.write(f"Line {line_number}: Missing 'output' field\n")


In [32]:
# Stage 2: keep only the first `{ ... }` span of every line.
#
# The search is non-greedy, so the span ends at the FIRST closing brace that
# follows the first opening brace. Lines with no such span get one repair
# attempt before being reported and dropped.

input_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/generated_tweets_residential_filtered.jsonl"
output_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/temp.jsonl"

# Compiled once; the same pattern is used for the first try and the retry
brace_span = re.compile(r"{.*?}", re.DOTALL)

with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    for line_number, line in enumerate(infile, start=1):
        original_line = line.strip()  # Remove leading and trailing whitespace
        match = brace_span.search(original_line)

        if match is None:
            # Repair attempt: swap the last two characters for `}"` and search
            # again (this supplies a closing brace to a truncated record).
            fixed_line = original_line[:-2] + '}"'
            match = brace_span.search(fixed_line)

        if match is None:
            # Nothing usable on this line; report it and write nothing
            print(f"Line {line_number}: No valid JSON-like content found even after fixing.")
            continue

        outfile.write(match.group() + "\n")


In [33]:
# Stage 3: repair lines that lack a closing bracket.
#
# A line containing "]" anywhere is copied through untouched. Otherwise the
# last character of the stripped line is replaced with `]}"`, which closes the
# list and the surrounding object.


input_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/temp.jsonl"
output_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/no_missing_brackets.jsonl"

with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    for line_number, line in enumerate(infile, start=1):
        if "]" in line:
            # Already has a closing bracket: copy the raw line as-is
            outfile.write(line)
            continue

        stripped = line.strip()
        # Whitespace-only lines are passed through unchanged (raw line kept)
        fixed_line = stripped[:-1] + ']}"' if stripped else line
        outfile.write(fixed_line + "\n")


In [34]:
# Stage 4: reduce every line to its first `[ ... ]` span (brackets included).

# Input and output file paths
input_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/no_missing_brackets.jsonl"
output_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/between_brackets.jsonl"

# Non-greedy: from the first "[" up to the first "]" that follows it
bracket_span = re.compile(r"\[.*?\]")

with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    for line_number, line in enumerate(infile, start=1):
        match = bracket_span.search(line)
        if match is None:
            # Report the line; nothing is written for it
            print(f"Line {line_number}: No content found between brackets.")
            continue
        outfile.write(match.group() + "\n")


In [35]:
# Stage 5: delete escaped newlines, backslashes and runs of spaces.

# Input and output file paths
input_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/between_brackets.jsonl"
output_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/final.jsonl"

# Literal substrings removed from every line. They are applied strictly in
# this order and each step works on the result of the previous one, so the
# sequence must not be reordered or pruned.
# Note that double spaces are deleted outright, not collapsed to one space.
removals = (
    "\\n\\",
    "\\n",
    "\\n ",
    "\\n  ",
    "\\n   ",
    "\\n    ",
    "\\",
    "  ",
    "    ",
)

with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    for line in infile:
        cleaned_line = line
        for fragment in removals:
            cleaned_line = cleaned_line.replace(fragment, "")
        # The real line terminator is untouched, so no newline is appended
        outfile.write(cleaned_line)

In [36]:
# Stage 6: patch the characters next to the opening and closing bracket.

input_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/final.jsonl"
output_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/final_.jsonl"


def _swap_penultimate(text, old, new):
    """Return `text` with its second-to-last character replaced by `new`.

    The replacement only happens when `text` has at least two characters and
    the second-to-last one equals `old`; otherwise `text` is returned as-is.
    Passing `new=""` deletes the character.
    """
    if len(text) > 1 and text[-2] == old:
        return text[:-2] + new + text[-1]
    return text


with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    for line_number, line in enumerate(infile, start=1):
        # Remove leading/trailing whitespace
        line = line.strip()

        # The steps run in sequence; each one sees the previous step's result.
        # 1. Drop a ")" sitting right before the last character
        line = _swap_penultimate(line, ")", "")

        # 2. Turn a "'" in second position into a double quote
        if len(line) > 1 and line[1] == "'":
            line = line[0] + '"' + line[2:]

        # 3. Turn a "'" right before the last character into a double quote
        line = _swap_penultimate(line, "'", '"')

        # 4. Drop a "," right before the last character
        line = _swap_penultimate(line, ",", "")

        # Write the fixed line to the output file
        outfile.write(line + "\n")

In [37]:
# Stage 7: if the character right before the last one is a space, remove it.

input_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/final_.jsonl"
output_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/final__.jsonl"

with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    for line_number, line in enumerate(infile, start=1):
        # Remove leading/trailing whitespace
        line = line.strip()

        # Only a single space is removed, and only at the second-to-last
        # position (e.g. between the last element and the closing character)
        space_before_last = len(line) > 1 and line[-2] == " "
        fixed = line[:-2] + line[-1] if space_before_last else line

        # Write the fixed line to the output file
        outfile.write(fixed + "\n")

In [22]:
# # if the second last character is not ", then add " in this place

# input_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/final__.jsonl"
# output_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/final___.jsonl"

# # Process the file
# with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
#     for line_number, line in enumerate(infile, start=1):
#         line = line.strip()
#         if len(line) > 1 and line[-2] != ",":
#             line = line[:-2] + '"' + line[-1]
    

#         # Write the fixed line to the output file
#         outfile.write(line + "\n")

In [None]:
# Validation: parse every cleaned line and record how many tweets it holds.
#
# NOTE(review): this reads `final___.jsonl` (three underscores), but the last
# active cleaning cell writes `final__.jsonl` (two underscores); the only cell
# that would write the three-underscore file is commented out. On a fresh
# kernel this file is either missing or left over from an earlier run —
# confirm which file is intended.

# Input file path
input_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/final___.jsonl"

# One entry per line that parsed as a JSON list
list_lengths = []

with open(input_file, "r", encoding="utf-8") as infile:
    for line_number, line in enumerate(infile, start=1):
        try:
            tweet_list = json.loads(line.strip())
        except json.JSONDecodeError as e:
            print(f"Line {line_number}: JSON decode error - {e}")
            continue

        if not isinstance(tweet_list, list):
            # Valid JSON but not a list: reported and left out of the lengths
            print(f"Line {line_number}: Not a valid list.")
            continue

        list_lengths.append(len(tweet_list))

# Output the results
print("Lengths of each list:", list_lengths)

In [None]:
# Validation: record the length of each building's "tweet_language_distribution".

# Input file path
input_file = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/generated tweets/building_info_6000.jsonl"

# One entry per record whose "tweet_language_distribution" value is a list
language_distribution_lengths = []

with open(input_file, "r", encoding="utf-8") as infile:
    for line_number, line in enumerate(infile, start=1):
        try:
            data = json.loads(line.strip())
        except json.JSONDecodeError as e:
            print(f"Line {line_number}: JSON decode error - {e}")
            continue

        has_list = "tweet_language_distribution" in data and isinstance(data["tweet_language_distribution"], list)
        if not has_list:
            # The key is absent, or present with a non-list value
            print(f"Line {line_number}: 'tweet language distribution' field is missing or not a list.")
            continue

        language_distribution_lengths.append(len(data["tweet_language_distribution"]))

# Output the results
# print("Lengths of 'tweet language distribution':", language_distribution_lengths)


# def count_mismatches(list1, list2):
#     # Ensure both lists are the same length
#     if len(list1) != len(list2):
#         print("The lists have different lengths.")
#         return None

#     # Count mismatches
#     mismatches = sum(1 for a, b in zip(list1, list2) if a != b)
#     return mismatches

# # Count mismatches
# mismatch_count = count_mismatches(list_lengths, language_distribution_lengths)

# # Output the result

# print("Number of mismatches:", mismatch_count)

# def create_mismatch_dicts(list1, list2):
#     """
#     Finds the indexes of mismatched items between two lists.
#     Returns a list of indexes where mismatches occur.
#     """
#     min_length = min(len(list1), len(list2))
#     dict_greater = {}  # For list1[i] > list2[i]
#     dict_difference = {}  # For list1[i] < list2[i]

#     # Compare elements at the same index
#     for i in range(min_length):
#         if list1[i] > list2[i]:
#             dict_greater[i] = list2[i]
#         elif list1[i] < list2[i]:
#             dict_difference[i] = list2[i] - list1[i]

#     return dict_greater, dict_difference

# greater_dict, difference_dict = create_mismatch_dicts(list_lengths, language_distribution_lengths)




In [152]:
# # Input and output file paths
# input_file = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/remove_bracket_.jsonl"
# output_file = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/remove_bracket_chopped.jsonl"

# # Example greater_dict with indices and corresponding values

# # Process the file
# with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
#     for line_number, line in enumerate(infile, start=1):
#         try:
#             # Parse the line as a JSON list
#             row_list = json.loads(line.strip())
#             if isinstance(row_list, list):
#                 # Check if the current line number has an index in greater_dict
#                 if line_number - 1 in greater_dict:  # Line numbers are 1-based, dict is 0-based
#                     chop_index = greater_dict[line_number - 1]
#                     # Chop the list up to chop_index
#                     row_list = row_list[:chop_index]
#                 # Write the modified list back to the file
#                 outfile.write(json.dumps(row_list, ensure_ascii=False) + "\n")
#             else:
#                 print(f"Line {line_number}: Not a valid list.")
#         except json.JSONDecodeError as e:
#             print(f"Line {line_number}: JSON decode error - {e}")


In [None]:
# # Input and output file paths
# input_file = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/final.jsonl"
# output_file = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/final_final.jsonl"

# # Function to fix lines with a trailing `)`
# def fix_trailing_parenthesis(line):
#     line = line.strip()
#     if line[-2] == ')':  # Check if the last second character is `)`
#         line = line[:-2] + line[-1]  # Remove the last second character
#     return line

# # Process the file and fix lines if needed
# with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
#     for line_number, line in enumerate(infile, start=1):
#         try:
#             # Parse the line as a JSON list
#             tweet_list = json.loads(line.strip())
#             if isinstance(tweet_list, list):
#                 # Get the length of the list and write it back
#                 # print(f"Line {line_number}: Length = {len(tweet_list)}")
#                 outfile.write(json.dumps(tweet_list, ensure_ascii=False) + "\n")
#             else:
#                 print(f"Line {line_number}: Not a valid list.")
#         except json.JSONDecodeError as e:
#             print(f"Line {line_number}: JSON decode error - {e}")
#             # Attempt to fix the line by removing the trailing `)`
#             fixed_line = fix_trailing_parenthesis(line)
#             try:
#                 # Retry parsing the fixed line
#                 tweet_list = json.loads(fixed_line)
#                 if isinstance(tweet_list, list):
#                     print(f"Line {line_number}: Fixed and Length = {len(tweet_list)}")
#                     outfile.write(json.dumps(tweet_list, ensure_ascii=False) + "\n")
#                 else:
#                     print(f"Line {line_number}: Fixed but not a valid list.")
#             except json.JSONDecodeError as retry_error:
#                 print(f"Line {line_number}: Still invalid after fix - {retry_error}")


In [None]:
# # verify if each line is a valid list

# # Define the path to your JSONL file
# input_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/final___.jsonl"  # Replace with your actual file path

# # Open and process the file line by line
# with open(input_file, "r", encoding="utf-8") as infile:
#     for line_number, line in enumerate(infile, start=1):
#         try:
#             # Parse the JSON line
#             parsed_data = json.loads(line.strip())

#             # Check if the parsed data is a list
#             if isinstance(parsed_data, list):
#                 None
#             else:
#                 print(f"❌ Line {line_number}: Not a list - Found {type(parsed_data)}")

#         except json.JSONDecodeError as e:
#             print(f"❌ Line {line_number}: Invalid JSON - {e}")

In [None]:
# Pair every building ID with the tweet list found at the same row position.
#
# The two files share no key: row N of the building file is matched with row N
# of the tweets file. Any row dropped or skipped in either file therefore
# shifts every following pair, so row-count mismatches and skipped rows are
# reported explicitly instead of being ignored silently.
#
# NOTE(review): `tweets_file` is `final___.jsonl`; confirm this is the intended
# output of the cleaning cells above (the last active one writes
# `final__.jsonl`).

# Input file paths
building_ids_file = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/generated tweets/building_info_6000.jsonl"  # First file with building IDs
tweets_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/final___.jsonl"         # Second file with lists of tweets
output_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/paper_dataset.jsonl"             # Output file

# List to store building IDs in their original sequence
building_ids = []

# Read and store building IDs in the order they appear
with open(building_ids_file, "r", encoding="utf-8") as infile:
    for line_number, line in enumerate(infile, start=1):
        try:
            data = json.loads(line.strip())
        except json.JSONDecodeError as e:
            print(f"Error parsing building IDs file: {e}")
            continue

        if isinstance(data, dict) and "building_id" in data:
            building_ids.append(data["building_id"])
        else:
            # A skipped row shifts all later pairings, so make it visible
            print(f"Warning: building IDs file line {line_number} has no 'building_id'; row skipped.")

# Ensure building IDs and tweets are combined in the original sequence
with open(tweets_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    tweet_lines = infile.readlines()

    # zip() stops at the shorter input without any signal; a count mismatch
    # means rows were lost somewhere and the positional pairing is suspect.
    if len(tweet_lines) != len(building_ids):
        print(
            f"Warning: {len(building_ids)} building IDs but {len(tweet_lines)} tweet rows; "
            "rows are paired by position, so the mapping may be misaligned and surplus rows are dropped."
        )

    for row_number, (building_id, line) in enumerate(zip(building_ids, tweet_lines), start=1):
        try:
            # Parse the line as a list of tweets
            tweet_list = json.loads(line.strip())
        except json.JSONDecodeError as e:
            print(f"Error parsing tweets file: {e}")
            continue

        if isinstance(tweet_list, list):
            # One JSON object per line: {"<building id>": [tweets...]}
            outfile.write(json.dumps({str(building_id): tweet_list}, ensure_ascii=False) + "\n")
        else:
            print(f"Warning: tweets file line {row_number} is not a list; building {building_id} skipped.")

print("Building ID to Tweets Mapping saved.")

Merge tweets and building files

In [54]:
# residential_tweet = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/final_residential.jsonl"
# commercial_tweet = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/final_commercial.jsonl"
# residential_building = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/residential_buildings_6558_.jsonl"
# commercial_building = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/commercial_buildings_7980_.jsonl"

# residential_commercial_tweet = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/residential_commercial_tweet"
# residential_commercial_building = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/residential_commercial_building"

# Paths of the tweet dataset and of the building metadata file.
# NOTE(review): neither name is read by any cell visible in this notebook; the
# merge cell below spells out the same two paths again under its own names.
tweets = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/paper_dataset.jsonl"
buildings = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/generated tweets/building_info_6000.jsonl"


In [56]:
import json
import ast  # To safely convert string representation of a list into an actual list

# Merge the per-building tweet lists into the building metadata records.
#
# Every metadata record is written to `output_file` with an added "tweets"
# key holding the tweet list of the matching building ID, or [] when no
# tweets are known for that building.

# File paths (replace with actual file paths)
metadata_file = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/generated tweets/building_info_6000.jsonl"  # First file with building metadata
tweets_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/paper_dataset.jsonl"      # Second file with tweet data
output_file = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/6000/merged_data.jsonl" # Output file


def _building_key(raw_id):
    """Return the lookup key for a building ID.

    Both files are normalised through this one function so that an ID stored
    as 123 in one file and "123" in the other still matches. IDs whose text
    form is an integer become `int`; anything else stays a string instead of
    raising ValueError.
    """
    text = str(raw_id)
    try:
        return int(text)
    except ValueError:
        return text


# Step 1: Read the tweets file and store them in a dictionary
tweets_dict = {}

with open(tweets_file, "r", encoding="utf-8") as file:
    for line in file:
        try:
            tweet_data = json.loads(line.strip())  # Parse JSON
        except json.JSONDecodeError as e:
            print(f"Error parsing tweets file: {e}")
            continue

        if not isinstance(tweet_data, dict):
            # Only {"<building id>": [...]} objects can be merged
            print(f"Warning: skipping non-object row in tweets file: {line.strip()}")
            continue

        for raw_id, building_tweets in tweet_data.items():
            tweets_dict[_building_key(raw_id)] = building_tweets

# Step 2: Read the metadata file, merge data, and write the final output
with open(metadata_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    for line in infile:
        try:
            # Parse the JSON object
            building_data = json.loads(line.strip())
        except json.JSONDecodeError as e:
            print(f"Error parsing metadata file: {e}")
            continue

        if not isinstance(building_data, dict):
            print(f"Warning: skipping non-object row in metadata file: {line.strip()}")
            continue

        # Convert 'tweet_language_distribution' from string to actual list if necessary
        if isinstance(building_data.get("tweet_language_distribution"), str):
            try:
                building_data["tweet_language_distribution"] = ast.literal_eval(building_data["tweet_language_distribution"])
            except (ValueError, SyntaxError):
                print(f"Warning: Could not parse tweet_language_distribution in line: {line.strip()}")

        # Retrieve tweets based on building_id; a record without an ID is
        # still written (with no tweets) rather than aborting the whole merge
        if "building_id" in building_data:
            lookup_key = _building_key(building_data["building_id"])
            building_data["tweets"] = tweets_dict.get(lookup_key, [])
        else:
            print(f"Warning: metadata row without 'building_id', no tweets attached: {line.strip()}")
            building_data["tweets"] = []

        # Write the merged JSON object to the output file
        json.dump(building_data, outfile, ensure_ascii=False)
        outfile.write("\n")

print("Merging complete! Data saved to 'merged_data.jsonl'.")


Merging complete! Data saved to 'merged_data.jsonl'.
