In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime; the data files used below are
# read from and written to paths under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install pyspark py4j



In [None]:
# Raw input file on the mounted Drive; later copied line by line with a line
# number prefixed to each line.
input_file_path = "/content/drive/MyDrive/myData.txt"
# How many NaN-containing lines, counted from the end of the file, to step
# back when choosing the cutoff line for incident counting. 0 puts the cutoff
# at the end of the file (nothing is excluded).
# NOTE(review): NaN lines presumably separate "sets" of records - confirm.
set_count_to_deduct_for_incident = 500

In [None]:
# Destination for the line-numbered copy of the input file.
output_file_path = "/content/drive/MyDrive/accomodation_lines.txt"

# Opening the same path for reading and for writing would truncate it before
# a single line is read. Fail loudly if `input_file_path` has been rebound to
# the output file (e.g. by running cells out of order).
if input_file_path == output_file_path:
    raise ValueError(
        "input_file_path points at the output file; reset it to the raw data "
        "file before numbering, otherwise the data would be truncated."
    )

# Copy every line of the input, prefixing it with its 1-based line number and
# a single space. The original line ending is kept as-is.
with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
    for line_number, line in enumerate(infile, start=1):
        outfile.write(f"{line_number} {line}")


In [None]:
# Line-numbered copy of the raw data. It gets its own name so the raw-data
# `input_file_path` is not overwritten: rebinding that name to this file and
# then re-running the numbering step would open one file for both reading and
# writing and truncate it.
numbered_file_path = "/content/drive/MyDrive/accomodation_lines.txt"

# Stream the file once, recording the 0-based index of every line containing
# 'NaN' and the total number of lines, instead of holding all lines in memory.
nan_indices = []
total_line_count = 0
with open(numbered_file_path, 'r') as infile:
    for line_index, line in enumerate(infile):
        total_line_count = line_index + 1
        if 'NaN' in line:
            nan_indices.append(line_index)

# Choose the cutoff:
#   * deduct count 0           -> end of file (nothing excluded);
#   * enough NaN lines         -> the N-th NaN line counted from the end;
#   * fewer NaN lines than N   -> the first NaN line;
#   * no NaN lines at all      -> no position to cut at, so report it clearly
#                                 instead of failing with an IndexError.
if set_count_to_deduct_for_incident == 0:
    cutoff_index = total_line_count
elif not nan_indices:
    raise ValueError(
        f"No line containing 'NaN' found in {numbered_file_path}; cannot "
        f"deduct {set_count_to_deduct_for_incident} set(s). "
        "Set set_count_to_deduct_for_incident to 0 to disable the cutoff."
    )
elif len(nan_indices) >= set_count_to_deduct_for_incident:
    cutoff_index = nan_indices[-set_count_to_deduct_for_incident]
else:
    cutoff_index = nan_indices[0]

print(f"Set count to deduct: {set_count_to_deduct_for_incident}")

print(f"Cutoff index: {cutoff_index}")

print(f"Number of NaN containing lines: {len(nan_indices)}")

Set count to deduct: 500
Cutoff index: 7530229
Number of NaN containing lines: 2502


In [None]:
from pyspark import SparkContext

# Reuse the active SparkContext if one exists, otherwise start one.
sc = SparkContext.getOrCreate()

# Read the line-numbered file as an RDD with one string element per line.
accomodation_file_path = "/content/drive/MyDrive/accomodation_lines.txt"
accomodation_rdd = sc.textFile(accomodation_file_path)

# The first line of the file is treated as the header.
header = accomodation_rdd.first()

# Drop every line equal to the header, then split the remaining lines on
# whitespace into lists of string tokens.
non_header_lines_rdd = accomodation_rdd.filter(lambda text: not (text == header))
accomodation_data_rdd = non_header_lines_rdd.map(lambda text: text.split())

# Keep only rows in which no token is exactly the string "NaN".
accomodation_cleaned_rdd = accomodation_data_rdd.filter(lambda tokens: "NaN" not in tokens)

In [None]:
def validate_and_map(row):
    """Turn a tokenised row into a (key, value) pair keyed on its second token.

    Args:
        row: Sequence of tokens for one line.

    Returns:
        ``(row[1], (row[0], row[2]))`` when the row has at least three
        tokens (any further tokens are ignored), otherwise ``None`` so the
        caller can filter the row out.
    """
    if len(row) < 3:
        return None  # Too short to be valid; filtered out by the caller

    first, second, third = row[:3]
    return (second, (first, third))

# Convert each cleaned row into a (key, value) pair; validate_and_map returns
# None for rows with fewer than three tokens, and those are dropped here.
mapped_rows_rdd = accomodation_cleaned_rdd.map(validate_and_map)
accomodation_key_value_rdd = mapped_rows_rdd.filter(lambda pair: pair is not None)

# Collect all values that share a key into one group per key.
accomodation_grouped_rdd = accomodation_key_value_rdd.groupByKey()

In [None]:
## REMOVE THIS BLOCK AFTER FINALIZING
# Debug preview: turn each group's iterable into a plain list so that it
# prints as its contents.
accomodation_grouped_data = accomodation_grouped_rdd.mapValues(lambda group: list(group))

# Print the first five (key, values) entries.
print("Accomodation Grouped Data:")
for grouped_entry in accomodation_grouped_data.take(5):
    print(grouped_entry)

Accomodation Grouped Data:
('1615', [('26', '73.076'), ('3859', '73.414'), ('7694', '73.766'), ('11515', '73.996'), ('15327', '74.075'), ('19148', '74.086'), ('22962', '74.223'), ('26755', '74.25'), ('30557', '74.356'), ('34354', '74.78'), ('38158', '75.04'), ('41952', '75.173'), ('45737', '75.168'), ('49531', '75.042'), ('53317', '74.969'), ('57104', '75.018'), ('60891', '75.022'), ('64670', '74.929'), ('68462', '74.727'), ('72243', '74.323'), ('76028', '73.844'), ('79799', '73.549'), ('83549', '73.318'), ('87316', '73.039'), ('91084', '72.705'), ('94831', '72.298'), ('109827', '72.193'), ('113559', '72.202'), ('117327', '72.199'), ('121102', '72.209'), ('124889', '72.212'), ('128689', '72.29'), ('132492', '72.382'), ('136306', '72.307'), ('140122', '72.269'), ('209538', '72.177'), ('213411', '72.282'), ('217270', '72.248'), ('221110', '72.226'), ('252119', '72.426'), ('255998', '72.495'), ('259864', '72.395'), ('263730', '72.456'), ('267620', '72.358'), ('294874', '72.422'), ('298783

In [None]:
# Right and left boundaries of the accommodation region.
# NOTE(review): units are not stated in this notebook - confirm.
accomodation_plane_Right = 73.2186
accomodation_plane_Left = 2.709

# The incident planes sit this far outside the accommodation planes.
incident_plane_offset = 20
incident_plane_Right = accomodation_plane_Right + incident_plane_offset
incident_plane_Left = accomodation_plane_Left - incident_plane_offset

In [None]:
def inside_accomodation(x_position):
    """Return True when x_position lies strictly between the accommodation planes.

    x_position may be a number or a numeric string; it is converted with
    float(). Positions exactly on either plane count as outside.
    """
    position = float(x_position)
    return accomodation_plane_Left < position < accomodation_plane_Right

def inside_incident(x_position):
    """Return True when x_position lies strictly between the incident planes.

    x_position may be a number or a numeric string; it is converted with
    float(). Positions exactly on either plane count as outside.
    """
    position = float(x_position)
    return incident_plane_Left < position < incident_plane_Right

def find_accomodation_or_incident(id, values):
    """Count incident-plane and accommodation-plane crossings for one ID.

    The observations are processed in ascending line-number order through a
    small state machine:

      * -1: initial state, nothing classified yet;
      *  0: last seen outside both regions;
      *  1: an incident crossing has been counted;
      *  2: an accommodation crossing has been counted.

    An incident is counted when a position falls inside the incident region
    (but not the accommodation region) while in state 0, and only if its line
    number is <= the module-level ``cutoff_index``. An accommodation is
    counted when a position falls inside the accommodation region while in
    state 1. Any position outside both regions resets the state to 0.

    Every counted crossing is also appended as a text line to ``states.txt``
    in the current working directory of the executing process.

    Args:
        id: Identifier the observations belong to; only used in log lines.
        values: Iterable of ``(line, x_position)`` pairs; ``line`` must be
            convertible with int() and ``x_position`` with float().

    Returns:
        ``[incident_count, accomodation_count]``; ``[0, 0]`` for empty input.
    """
    if not values:
        # Same shape as the normal result, so callers that index or sum the
        # pair do not break on an empty group.
        return [0, 0]

    sorted_values = sorted(values, key=lambda x: int(x[0]))
    accomodation_count = 0
    incident_count = 0
    current_state = -1
    for line, x_position in sorted_values:
        # The accommodation region is tested first, so the incident branch
        # only sees positions that are not inside the accommodation region.
        if inside_accomodation(x_position):
            if current_state == 1:
                accomodation_count += 1
                current_state = 2
                with open('states.txt', 'a') as f:
                  f.write(f"{id} Mol crossed the accomodation plane on line: {line}, {x_position}\n")
        elif inside_incident(x_position):
            if current_state == 0 and int(line)<=cutoff_index:
                incident_count += 1
                current_state = 1
                with open('states.txt', 'a') as f:
                  f.write(f"{id} Mol crossed the incident plane on line: {line}, {x_position}\n")
        else:
            current_state = 0
    return [incident_count, accomodation_count]

def _count_crossings_for_group(grouped_entry):
    """Map one (key, grouped values) entry to [key, [incident_count, accomodation_count]]."""
    key, grouped_values = grouped_entry
    return [key, find_accomodation_or_incident(key, list(grouped_values))]

# One [key, counts] element per group.
accomodation_rdd = accomodation_grouped_rdd.map(_count_crossings_for_group)


In [None]:
## REMOVE THIS BLOCK AFTER FINALIZING
# Debug preview: bring all [key, counts] results to the driver and print the
# first one. `counts` is [incident_count, accomodation_count].
accomodations = accomodation_rdd.collect()

print("Accomodations:")
# The loop variable is deliberately not called `id`: at notebook top level
# that would shadow the builtin id() for every later cell. The statements
# that used to follow an unconditional `break` could never run and have been
# removed.
for mol_id, counts in accomodations[:1]:
    print(f"ID {mol_id}: {counts}")

Accomodations:
ID 1615: [0, 0]


In [None]:
# Sum the per-ID [incident_count, accomodation_count] pairs element-wise.
# fold() with a (0, 0) zero value also works on an empty RDD, where reduce()
# would raise ValueError.
total_counts = accomodation_rdd.map(lambda x: x[1]).fold(
    (0, 0), lambda a, b: (a[0] + b[0], a[1] + b[1])
)

# Calculate the overall ratio. With zero incidents the ratio is undefined, so
# report NaN; this also keeps overall_ratio a float in every case rather than
# a string in one branch.
total_incidents, total_accommodations = total_counts
overall_ratio = total_accommodations / total_incidents if total_incidents != 0 else float("nan")

# Display the results
print(f"Total Accommodations: {total_accommodations}")
print(f"Total Incidents: {total_incidents}")
print(f"Overall Accommodation-to-Incident Ratio: {overall_ratio}")

Total Accommodations: 28
Total Incidents: 30
Overall Accommodation-to-Incident Ratio: 0.9333333333333333
