In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw token table from an absolute path
# (presumably the Colab working directory -- adjust when running elsewhere).
df = pd.read_csv("/content/fact_tokens_SLM.csv")
# Last expression of the cell, so the notebook renders the first rows.
df.head()

Unnamed: 0,id,L,S,M_surface,M_tags
0,1,janamejaya,janamejayaḥ,janamejaya,SNM
1,2,vac,uvāca,vac,SPs3In
2,3,evam,evam,evam,
3,4,dyūta,dyūta,dyūta,Cp
4,5,ji,jitāḥ,ji,PNPaM


In [4]:
def parse_tag(tag):
    """
    Expand a compact morphological tag into a readable feature string.

    The tag is scanned left to right. At every position the longest known
    code is consumed first (so 'Ps' wins over 'P'); a character that starts
    no known code is skipped. The collected labels are sorted alphabetically
    and joined with '; '.

    A missing or blank tag (None, NaN, empty or whitespace-only) returns the
    fixed text "Note: Probably indeclinable". A non-blank tag containing no
    known code returns an empty string.
    """
    if pd.isna(tag) or not str(tag).strip():
        return "Note: Probably indeclinable"

    # Codes that are two characters wide
    two_char_codes = {
        'Du': 'Number = Dual', 'Ne': 'Gender = Neuter', 'Pa': 'VerbForm = Part',
        'Gd': 'VerbForm = Gerundive', 'Pr': 'Tense = Present', 'Ps': 'Tense = Past',
        'Fu': 'Tense = Future', 'In': 'Mood = Indicative', 'Im': 'Mood = Imperative',
        'Pv': 'Voice = Passive', 'Cp': 'Compound', 'Co': 'Conjunction',
    }
    # Codes that are a single character wide
    one_char_codes = {
        'S': 'Number = Singular', 'P': 'Number = Plural', 'N': 'Case = Nominative',
        'G': 'Case = Genitive', 'A': 'Case = Accusative', 'I': 'Case = Instrumental',
        'L': 'Case = Locative', 'V': 'Case = Vocative', 'B': 'Case = Ablative',
        'M': 'Gender = Masculine', 'F': 'Gender = Feminine', 'O': 'Mood = Optative',
        '1': 'Person = 1', '2': 'Person = 2', '3': 'Person = 3',
    }
    grammar = {**two_char_codes, **one_char_codes}

    # Try the widest codes first so a long code is never split into short ones
    widths = sorted({len(code) for code in grammar}, reverse=True)

    text = str(tag)
    labels = []
    pos = 0
    while pos < len(text):
        step = 1  # advance one character when nothing matches here
        for width in widths:
            piece = text[pos:pos + width]
            # A slice cut short by the end of the string is left to a narrower width
            if len(piece) == width and piece in grammar:
                labels.append(grammar[piece])
                step = width
                break
        pos += step

    labels.sort()
    return "; ".join(labels)

In [5]:
# Expand every M_tags code into its readable description with parse_tag;
# missing tags receive parse_tag's "Note: Probably indeclinable" text.
df['morph_expand'] = df['M_tags'].apply(parse_tag)
print("New 'morph_expand' column has been created.")

New 'morph_expand' column has been created.


In [9]:
# --- Step 4: Save the new DataFrame to a new CSV file ---
# The file name is relative, so it is written to the current working directory.
# index=False keeps the row index out of the file.
output_filename = 'fact_tokens_expanded.csv'
df.to_csv(output_filename, index=False)
print(f"Processing complete. The new data has been saved to '{output_filename}'.")

Processing complete. The new data has been saved to 'fact_tokens_expanded.csv'.


In [8]:
# Display the frame to check the new 'morph_expand' column.
df

Unnamed: 0,id,L,S,M_surface,M_tags,morph_expand
0,1,janamejaya,janamejayaḥ,janamejaya,SNM,Case = Nominative; Gender = Masculine; Number ...
1,2,vac,uvāca,vac,SPs3In,Mood = Indicative; Number = Singular; Person =...
2,3,evam,evam,evam,,Note: Probably indeclinable
3,4,dyūta,dyūta,dyūta,Cp,Compound
4,5,ji,jitāḥ,ji,PNPaM,Case = Nominative; Gender = Masculine; Number ...
...,...,...,...,...,...,...
673,674,niśā,niśām,niśā,SAF,Case = Accusative; Gender = Feminine; Number =...
674,675,rājan,rājā,rājan,SNM,Case = Nominative; Gender = Masculine; Number ...
675,676,duḥkha,duḥkha,duḥkha,Cp,Compound
676,677,śoka,śoka,śoka,Cp,Compound


# Dividing the `morph_expand` strings into distinct, individual feature columns

In [None]:
import pandas as pd
import numpy as np

# --- Re-load data if starting a new session ---
# A missing file is only reported, not raised: in that case `df` is not
# assigned here and keeps whatever an earlier cell bound to it (if anything).
try:
    df = pd.read_csv('fact_tokens_expanded.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'fact_tokens_expanded.csv' not found.")
# ---------------------------------------------

def parse_morph_expand(text):
    """
    Turn a 'morph_expand' string into a {feature: value} dictionary.

    Segments are separated by ';'. Only segments containing '=' contribute:
    the text before the first '=' becomes the key and everything after it
    the value, both stripped of surrounding whitespace. Segments without
    '=' (e.g. 'Compound') are ignored. Any non-string input (e.g. NaN)
    yields an empty dictionary.

    Example: 'Case = Nominative; Gender = Masculine' -> {'Case': 'Nominative', 'Gender': 'Masculine'}
    """
    if not isinstance(text, str):
        return {}

    # partition() cuts at the first '=' only; its separator slot is empty
    # when the segment contains no '=' at all.
    pieces = (segment.partition('=') for segment in text.split(';'))
    return {name.strip(): val.strip() for name, sep, val in pieces if sep}

print("Morphological parsing function created.")

#### **Step 3.2: Apply the Function and Expand into Columns**

# Apply parse_morph_expand to every row of the `morph_expand` column to get a
# list of dictionaries. pd.DataFrame then turns that list into one column per
# distinct key, leaving NaN where a row has no value for that key.

# Only runs when a `df` is bound in the notebook namespace.
if 'df' in locals():
    # 1. Apply the function to create a list of dictionaries
    morph_dicts = df['morph_expand'].apply(parse_morph_expand).tolist()

    # 2. Convert the list of dictionaries into a new DataFrame
    morph_features_df = pd.DataFrame(morph_dicts)

    # 3. Concatenate our new feature columns with the original DataFrame.
    #    axis=1 places the frames side by side, matching rows by index label.
    #    NOTE(review): assumes `df` still has the default 0..n-1 index from read_csv -- confirm.
    df_expanded = pd.concat([df, morph_features_df], axis=1)

    # Let's see the result!
    print("Successfully expanded morphological features into new columns.")
    print("New DataFrame shape:", df_expanded.shape)

    # Display the first few rows with the new columns
    # (You might need to scroll right in Colab's output to see them all)
    pd.set_option('display.max_columns', None) # Show all columns; this is a global pandas option
    print(df_expanded.head())

Dataset loaded successfully.
Morphological parsing function created.
Successfully expanded morphological features into new columns.
New DataFrame shape: (678, 14)
   id           L            S   M_surface  M_tags  \
0   1  janamejaya  janamejayaḥ  janamejaya     SNM   
1   2         vac        uvāca         vac  SPs3In   
2   3        evam         evam        evam     NaN   
3   4       dyūta        dyūta       dyūta      Cp   
4   5          ji        jitāḥ          ji   PNPaM   

                                        morph_expand        Case     Gender  \
0  Case = Nominative; Gender = Masculine; Number ...  Nominative  Masculine   
1  Mood = Indicative; Number = Singular; Person =...         NaN        NaN   
2                        Note: Probably indeclinable         NaN        NaN   
3                                           Compound         NaN        NaN   
4  Case = Nominative; Gender = Masculine; Number ...  Nominative  Masculine   

     Number        Mood Person Tense 

In [None]:
# NEW -- Step 3.3: Creating a 'Word_Type' Column

def assign_word_type(row):
    """
    Classify a row by the text of its 'morph_expand' value.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row) providing the key 'morph_expand'.

    Returns
    -------
    str
        'Compound'     if the text contains 'Compound',
        'Indeclinable' if it contains 'indeclinable',
        'Conjunction'  if it contains 'Conjunction',
        'Standard'     otherwise, including when the value is not a string
                       (e.g. NaN).

    The checks run in the order listed, so the first match wins.
    """
    morph_text = row['morph_expand']
    if not isinstance(morph_text, str):
        # No textual description to inspect (e.g. NaN): use the default category.
        return 'Standard'
    if 'Compound' in morph_text:
        return 'Compound'
    if 'indeclinable' in morph_text:
        return 'Indeclinable'
    # Must match the capitalised label 'Conjunction' that parse_tag emits for
    # the 'Co' tag; the earlier misspelled, lower-case check never matched it.
    if 'Conjunction' in morph_text:
        return 'Conjunction'
    # Default case for words that have features but are not compounds/indeclinables
    return 'Standard'

# Apply this function to create our new column
# (axis=1 hands each row to assign_word_type, which reads its 'morph_expand' value)
df_expanded['Word_Type'] = df_expanded.apply(assign_word_type, axis=1)

print("\n'Word_Type' column created successfully.\n")

# Let's check the distribution of our new categories
print("Value counts for 'Word_Type':")
print(df_expanded['Word_Type'].value_counts())

# Let's look at a few examples of compounds
print("\n--- Example of 'Compound' words ---")
print(df_expanded[df_expanded['Word_Type'] == 'Compound'].head())


'Word_Type' column created successfully.

Value counts for 'Word_Type':
Word_Type
Standard        450
Indeclinable    122
Compound        106
Name: count, dtype: int64

--- Example of 'Compound' words ---
    id         L         S M_surface M_tags morph_expand Case Gender Number  \
3    4     dyūta     dyūta     dyūta     Cp     Compound  NaN    NaN    NaN   
24  25     pūrva     pūrva     pūrva     Cp     Compound  NaN    NaN    NaN   
28  29  aiśvarya  aiśvarya  aiśvarya     Cp     Compound  NaN    NaN    NaN   
36  37     śakra     śakra     śakra     Cp     Compound  NaN    NaN    NaN   
37  38   pratima   pratima   pratima     Cp     Compound  NaN    NaN    NaN   

   Mood Person Tense VerbForm Voice Word_Type  
3   NaN    NaN   NaN      NaN   NaN  Compound  
24  NaN    NaN   NaN      NaN   NaN  Compound  
28  NaN    NaN   NaN      NaN   NaN  Compound  
36  NaN    NaN   NaN      NaN   NaN  Compound  
37  NaN    NaN   NaN      NaN   NaN  Compound  


In [None]:
# Display the expanded frame, including the feature columns and 'Word_Type'.
df_expanded

Unnamed: 0,id,L,S,M_surface,M_tags,morph_expand,Case,Gender,Number,Mood,Person,Tense,VerbForm,Voice,Word_Type
0,1,janamejaya,janamejayaḥ,janamejaya,SNM,Case = Nominative; Gender = Masculine; Number ...,Nominative,Masculine,Singular,,,,,,Standard
1,2,vac,uvāca,vac,SPs3In,Mood = Indicative; Number = Singular; Person =...,,,Singular,Indicative,3,Past,,,Standard
2,3,evam,evam,evam,,Note: Probably indeclinable,,,,,,,,,Indeclinable
3,4,dyūta,dyūta,dyūta,Cp,Compound,,,,,,,,,Compound
4,5,ji,jitāḥ,ji,PNPaM,Case = Nominative; Gender = Masculine; Number ...,Nominative,Masculine,Plural,,,,Part,,Standard
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
673,674,niśā,niśām,niśā,SAF,Case = Accusative; Gender = Feminine; Number =...,Accusative,Feminine,Singular,,,,,,Standard
674,675,rājan,rājā,rājan,SNM,Case = Nominative; Gender = Masculine; Number ...,Nominative,Masculine,Singular,,,,,,Standard
675,676,duḥkha,duḥkha,duḥkha,Cp,Compound,,,,,,,,,Compound
676,677,śoka,śoka,śoka,Cp,Compound,,,,,,,,,Compound


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set a professional style for our plots
# (both settings are global and affect every later figure in this session)
sns.set(style="whitegrid", font_scale=1.2)
plt.rcParams['figure.figsize'] = (12, 6)

# Load the dataset
# Make sure 'fact_tokens_expanded.csv' is in the same directory or provide the correct path
# A missing file is only reported, not raised, so `df` is not assigned by this cell in that case.
try:
    df = pd.read_csv('fact_tokens_expanded.csv')
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("Error: 'fact_tokens_expanded.csv' not found. Please upload the file to your Colab environment.")

# Display the first few rows to understand its structure
# (only when a `df` is bound in the notebook namespace)
if 'df' in locals():
    print(df.head())

Dataset loaded successfully!
   id           L            S   M_surface  M_tags  \
0   1  janamejaya  janamejayaḥ  janamejaya     SNM   
1   2         vac        uvāca         vac  SPs3In   
2   3        evam         evam        evam     NaN   
3   4       dyūta        dyūta       dyūta      Cp   
4   5          ji        jitāḥ          ji   PNPaM   

                                        morph_expand  
0  Case = Nominative; Gender = Masculine; Number ...  
1  Mood = Indicative; Number = Singular; Person =...  
2                        Note: Probably indeclinable  
3                                           Compound  
4  Case = Nominative; Gender = Masculine; Number ...  
