In [19]:
import pandas as pd
import ast
from IPython.display import display

In [30]:
# Locations of the pre-processed input CSVs, one per dataset split.
train_path, test_path = 'train_processed.csv', 'test_processed.csv'

In [31]:
# Read both splits into DataFrames (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

In [32]:
# Label the preview, then let the notebook render the first five training rows.
print("Train data sample:")
df_train.head(n=5)

Train data sample:


Unnamed: 0,sentenceId,raw_text,aspectTerms
0,241,"As a blistering exercise in sustained tension,...","[{'term': 'Warfare', 'polarity': 'positive'}, ..."
1,1680,Surprisingly very good adaption of the origina...,"[{'term': 'adaption', 'polarity': 'positive'},..."
2,693,Seeing this film was the absolute best decisio...,"[{'term': 'film', 'polarity': 'positive'}, {'t..."
3,421,For a film so clearly designed to be fun above...,"[{'term': 'film', 'polarity': 'neutral'}, {'te..."
4,826,"“Freaky Tales” will find its audience, I think...","[{'term': 'Freaky Tales', 'polarity': 'positiv..."


In [33]:
# Label the preview, then let the notebook render the first five test rows.
print("Test data sample:")
df_test.head(n=5)

Test data sample:


Unnamed: 0,sentenceId,raw_text,aspectTerms
0,1404,Very good plot with an unexpected ending,"[{'term': 'plot', 'polarity': 'positive'}, {'t..."
1,354,Mickey 17 is an ambitious spectacle. One that ...,"[{'term': 'Mickey 17', 'polarity': 'positive'}..."
2,1344,"An audacious, exhilarating and heartfelt spect...","[{'term': 'spectacle', 'polarity': 'positive'}..."
3,909,What we’ve wound up with in these Freaky Tales...,"[{'term': 'Freaky Tales', 'polarity': 'positiv..."
4,1300,An exhilarating survive-the-night vampire thri...,"[{'term': 'vampire thriller', 'polarity': 'pos..."


In [34]:
def eda_info(df, name):
    """
    Print a structural summary and descriptive statistics for a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    name : str
        Label inserted into the printed section headers.

    Returns
    -------
    None
        Everything is emitted as printed / displayed output.
    """
    print(f"=== {name} Info ===")
    # DataFrame.info() writes its report straight to stdout and returns None,
    # so it must not be wrapped in display(): that renders a stray "None".
    df.info()
    print(f"\n{name} description:")
    # include='all' summarises object columns alongside the numeric ones.
    display(df.describe(include='all'))

In [35]:
# Summarise each split: training data first, then test data.
eda_info(df=df_train, name='Train')
eda_info(df=df_test, name='Test')

=== Train Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1548 entries, 0 to 1547
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sentenceId   1548 non-null   int64 
 1   raw_text     1548 non-null   object
 2   aspectTerms  1548 non-null   object
dtypes: int64(1), object(2)
memory usage: 36.4+ KB


None


Train description:


Unnamed: 0,sentenceId,raw_text,aspectTerms
count,1548.0,1548,1548
unique,,1378,1326
top,,Hauser’s performance as a man whose determinat...,"[{'term': 'movie', 'polarity': 'positive'}]"
freq,,4,21
mean,994.133721,,
std,582.01173,,
min,1.0,,
25%,489.75,,
50%,987.5,,
75%,1500.25,,


=== Test Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sentenceId   381 non-null    int64 
 1   raw_text     381 non-null    object
 2   aspectTerms  381 non-null    object
dtypes: int64(1), object(2)
memory usage: 9.1+ KB


None


Test description:


Unnamed: 0,sentenceId,raw_text,aspectTerms
count,381.0,381,381
unique,,367,360
top,,The Luckiest Man in America is good follow-up ...,"[{'term': 'movie', 'polarity': 'positive'}]"
freq,,2,7
mean,982.330709,,
std,579.594478,,
min,24.0,,
25%,479.0,,
50%,978.0,,
75%,1476.0,,


In [36]:
def parse_aspect_terms(x):
    """
    Convert a string representation of a list of dicts into a Python list of dicts.

    Parameters
    ----------
    x : object
        Usually the raw cell value read from the CSV (a string such as
        "[{'term': 'plot', 'polarity': 'positive'}]"). A value that is already
        a list is accepted too.

    Returns
    -------
    list
        * ``x`` itself when it is already a list, so applying this function a
          second time to a converted column is a no-op instead of wiping it.
        * The parsed literal when ``x`` is a non-blank string that parses.
        * ``[]`` when ``x`` is blank, is neither a string nor a list
          (e.g. None / NaN), or cannot be parsed.

    Notes
    -----
    The parsed value is returned as-is and is not checked to be a list of
    dicts; that check is left to the caller.
    """
    # Already converted: keep the data rather than replacing it with [].
    if isinstance(x, list):
        return x
    if isinstance(x, str) and x.strip():
        try:
            # literal_eval only evaluates Python literals, never arbitrary code.
            return ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError):
            # TypeError covers malformed literals such as an unhashable dict
            # key, which would otherwise escape the "empty list on parse
            # errors" contract.
            return []
    return []

In [37]:
# Parse the stringified `aspectTerms` column of each split in place and show
# a short preview of the result.
for df, name in zip((df_train, df_test), ('Train', 'Test')):
    df['aspectTerms'] = df['aspectTerms'].apply(parse_aspect_terms)
    print(f"{name} - converted `aspectTerms` sample:")
    display(df.head()[['sentenceId', 'raw_text', 'aspectTerms']])

Train - converted `aspectTerms` sample:


Unnamed: 0,sentenceId,raw_text,aspectTerms
0,241,"As a blistering exercise in sustained tension,...","[{'term': 'Warfare', 'polarity': 'positive'}, ..."
1,1680,Surprisingly very good adaption of the origina...,"[{'term': 'adaption', 'polarity': 'positive'},..."
2,693,Seeing this film was the absolute best decisio...,"[{'term': 'film', 'polarity': 'positive'}, {'t..."
3,421,For a film so clearly designed to be fun above...,"[{'term': 'film', 'polarity': 'neutral'}, {'te..."
4,826,"“Freaky Tales” will find its audience, I think...","[{'term': 'Freaky Tales', 'polarity': 'positiv..."


Test - converted `aspectTerms` sample:


Unnamed: 0,sentenceId,raw_text,aspectTerms
0,1404,Very good plot with an unexpected ending,"[{'term': 'plot', 'polarity': 'positive'}, {'t..."
1,354,Mickey 17 is an ambitious spectacle. One that ...,"[{'term': 'Mickey 17', 'polarity': 'positive'}..."
2,1344,"An audacious, exhilarating and heartfelt spect...","[{'term': 'spectacle', 'polarity': 'positive'}..."
3,909,What we’ve wound up with in these Freaky Tales...,"[{'term': 'Freaky Tales', 'polarity': 'positiv..."
4,1300,An exhilarating survive-the-night vampire thri...,"[{'term': 'vampire thriller', 'polarity': 'pos..."


In [38]:
# Sanity check: every row of each split must hold a list whose items are all dicts.
assert all(
    isinstance(terms, list) and all(isinstance(item, dict) for item in terms)
    for terms in df_train['aspectTerms']
), "Conversion failed for some rows in train"
assert all(
    isinstance(terms, list) and all(isinstance(item, dict) for item in terms)
    for terms in df_test['aspectTerms']
), "Conversion failed for some rows in test"
print("All `aspectTerms` entries are now list of dicts.")

All `aspectTerms` entries are now list of dicts.


In [39]:
# Output locations for the converted splits.
clean_train_path, clean_test_path = (
    'train_processed_cleaned.csv',
    'test_processed_cleaned.csv',
)

# Write both frames without the positional index column.
df_train.to_csv(path_or_buf=clean_train_path, index=False)
df_test.to_csv(path_or_buf=clean_test_path, index=False)

# Report where the files went; sep="" keeps the output text unchanged.
print(
    "Cleaned files saved to (all original columns preserved):\n",
    f"{clean_train_path}\n",
    f"{clean_test_path}",
    sep="",
)

Cleaned files saved to (all original columns preserved):
train_processed_cleaned.csv
test_processed_cleaned.csv
