# In [None]:
import pandas as pd
import re
import json

# Read the CSV dataset into a DataFrame (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="Task_B_Dataset.csv")

# Helpers for extracting the phrase that matches a given tag pattern
def _split_token(token):
    """Split a ``word/tag`` token on its LAST slash.

    Splitting on the last slash keeps words that themselves contain "/"
    intact. A token without any slash is treated as a word with an empty tag.
    """
    word, sep, tag = token.rpartition("/")
    return (word, tag) if sep else (token, "")


def extract_patterns(raw_text, pattern_type):
    """Return the longest phrase in a tagged sentence that matches a pattern.

    Args:
        raw_text: Sentence made of whitespace-separated ``word/tag`` tokens.
            Anything that is not a string (e.g. ``None`` or a NaN cell)
            yields no match.
        pattern_type: ``"pattern 1"`` for an ``in`` tag followed by one or
            more ``nn`` tags, or ``"pattern 2"`` for a ``jj`` tag followed by
            one or more ``nn`` tags. Any other value yields no match.

    Returns:
        An empty list when nothing matches, otherwise a one-element list
        holding a dict with the keys:

        * ``begin`` / ``end``: character offsets of the matched tokens within
          the tagged sentence with its tokens joined by single spaces.
        * ``text``: the matched words (tags stripped) joined by spaces.
        * ``phrase_type``: the matched tags joined by spaces, e.g. ``"in nn"``.

        When several phrases match, the one with the longest ``text`` wins;
        ties go to the phrase that starts first.
    """
    head_tag = {"pattern 1": "in", "pattern 2": "jj"}.get(pattern_type)
    if head_tag is None or not isinstance(raw_text, str):
        return []

    tokens = raw_text.split()
    pairs = [_split_token(token) for token in tokens]
    words = [word for word, _ in pairs]
    tags = [tag for _, tag in pairs]

    # token_starts[k] is where token k begins in " ".join(tokens).
    token_starts = []
    offset = 0
    for token in tokens:
        token_starts.append(offset)
        offset += len(token) + 1

    best = None
    for start, tag in enumerate(tags):
        if tag != head_tag:
            continue

        # Extend over the maximal run of nouns that follows the head tag.
        stop = start + 1
        while stop < len(tags) and tags[stop] == "nn":
            stop += 1
        if stop == start + 1:
            # A head tag with no noun after it is not a phrase.
            continue

        text = " ".join(words[start:stop])
        # Strict ">" keeps the earliest phrase when lengths tie.
        if best is None or len(text) > len(best["text"]):
            last = stop - 1
            best = {
                "begin": token_starts[start],
                "end": token_starts[last] + len(tokens[last]),
                "text": text,
                "phrase_type": " ".join(tags[start:stop]),
            }

    return [best] if best is not None else []

# Process sentences to generate JSON output
def generate_json_output(df, pattern_type):
    """Collect every sentence of ``df`` that contains a phrase for a pattern.

    Args:
        df: DataFrame with the columns ``filename``, ``para_id``, ``sent_id``
            and ``raw_text``; ``raw_text`` holds whitespace-separated
            ``word/tag`` tokens.
        pattern_type: Pattern name handed unchanged to ``extract_patterns``
            and echoed in the result.

    Returns:
        A dict ``{"pattern": pattern_type, "sents": [...]}``. Each entry of
        ``sents`` carries the row's ``filename``, ``para_id`` and ``sent_id``,
        the sentence with its tags stripped (``sent_text``) and the
        ``phrases`` returned by ``extract_patterns``. Rows without a match
        are left out.
    """
    sentences = []

    for _, row in df.iterrows():
        raw_text = row["raw_text"]

        # Empty CSV cells typically load as NaN (a float), which has no
        # tokens to scan; skip anything that is not text.
        if not isinstance(raw_text, str):
            continue

        phrases = extract_patterns(raw_text, pattern_type)
        if not phrases:
            continue

        # Strip the tag after the LAST slash so words containing "/" stay
        # whole; a token without a slash is kept as-is.
        words = []
        for token in raw_text.split():
            word, sep, _tag = token.rpartition("/")
            words.append(word if sep else token)

        sentences.append({
            "filename": row["filename"],
            "para_id": row["para_id"],
            "sent_id": row["sent_id"],
            "sent_text": " ".join(words),
            "phrases": phrases,
        })

    return {
        "pattern": pattern_type,
        "sents": sentences,
    }

# Run the extraction once per pattern over the loaded dataset
pattern1_results = generate_json_output(df, "pattern 1")
pattern2_results = generate_json_output(df, "pattern 2")

# Write each result set to its own JSON file, indented by four spaces
for out_path, payload in (
    ("pattern1_results.json", pattern1_results),
    ("pattern2_results.json", pattern2_results),
):
    with open(out_path, "w") as out_file:
        json.dump(payload, out_file, indent=4)

print("Pattern extraction and JSON generation complete!")