In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime);
# every data path used later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import re
import pandas as pd
from os import path
from underthesea import word_tokenize

In [None]:
!pip install underthesea

Collecting underthesea
  Downloading underthesea-6.8.4-py3-none-any.whl.metadata (15 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp310-cp310-manylinux2010_x86_64.whl.metadata (1.7 kB)
Downloading underthesea-6.8.4-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading underthesea_core-1.0.4-cp310-cp310-manylinux2010_x86_64.whl (657 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m41.1 MB/s[0m eta [36m

In [None]:

# Vietnamese accented letters: the lowercase block, the uppercase block, then the six
# bare modified capitals. Group sizes per case are a=17, e=11, d=1, i=5, o=17, u=11, y=5.
accented_chars = 'àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ'
# ASCII replacement for the character at the same position in `accented_chars`.
# Built from the group sizes so the two strings cannot drift out of alignment
# (a hand-typed run previously had three 'I's for the five letters ÌÍỈĨỊ).
unaccented_chars = (
    'a' * 17 + 'e' * 11 + 'd' + 'i' * 5 + 'o' * 17 + 'u' * 11 + 'y' * 5
    + 'A' * 17 + 'E' * 11 + 'D' + 'I' * 5 + 'O' * 17 + 'U' * 11 + 'Y' * 5
    + 'AADOOU'
)
# One row per base vowel. Columns 0-5 hold that vowel with each tone mark, in a fixed
# order: 0 = no tone mark, 1 = grave, 2 = acute, 3 = hook above, 4 = tilde, 5 = dot below.
# The last column is an ASCII code for the base vowel (it looks like Telex typing input;
# that reading is an assumption — confirm before relying on it).
# Row numbers are referenced directly by the tone-normalisation code (e.g. row 5 is "i",
# row 9 is "u"), so rows must not be reordered.
accented_vowels_table = [
    ['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
    ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
    ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
    ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
    ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
    ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
    ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
    ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
    ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
    ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
    ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
    ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y'],
]


def generate_charset_conversion_dict():
    """
    Build a dictionary mapping every spelling of a toned Vietnamese vowel
    to its single precomposed Unicode code point.

    A toned vowel can be stored as one precomposed character or as a base letter
    followed by combining marks; both render identically but compare unequal.
    For each of the 12 base vowels (both cases) and 5 tone marks, the dictionary
    contains three keys that all map to the precomposed character:
      * base vowel (already carrying its circumflex/breve/horn) + combining tone mark,
      * the fully decomposed (NFD) sequence,
      * the precomposed character itself (identity entry).

    The table is generated rather than typed out, so it does not depend on how the
    notebook file itself happens to be normalised.

    Returns:
        dict[str, str]: spelling -> precomposed character.
    """
    # Local import: the notebook's import cell does not bring in unicodedata.
    import unicodedata

    base_vowels = 'a\u00e2\u0103e\u00eaio\u00f4\u01a1u\u01b0y'  # a â ă e ê i o ô ơ u ư y
    tone_marks = '\u0300\u0301\u0309\u0303\u0323'  # grave, acute, hook above, tilde, dot below

    dic = {}
    for base in base_vowels + base_vowels.upper():
        for tone in tone_marks:
            composite = base + tone
            precomposed = unicodedata.normalize('NFC', composite)
            dic[composite] = precomposed
            dic[unicodedata.normalize('NFD', precomposed)] = precomposed
            dic[precomposed] = precomposed

    return dic


def generate_vowel_indexes_dict():
    """
    Index every vowel of accented_vowels_table by its (row, column) position.

    The last entry of each row is skipped, so only the base vowel and its toned
    variants become keys.
    """

    return {
        vowel: (row_number, column_number)
        for row_number, row in enumerate(accented_vowels_table)
        for column_number, vowel in enumerate(row[:-1])
    }


# Module-level lookup tables, built once when this cell runs.
utf8_chars_dict = generate_charset_conversion_dict()
accented_vowel_indexes_dict = generate_vowel_indexes_dict()


def remove_special_characters(original_string: str):
    """
    Replace every special character of the input string with a single space,
    keeping whitespace, digits, ASCII letters and accented Latin characters.

    The kept range starts at U+00C0 (À) rather than U+00E0 (à), so uppercase
    accented letters such as Â, Ô or Ý survive instead of being blanked out;
    the multiplication sign U+00D7 inside that range is still removed.
    Characters are replaced one-for-one, so the string length is unchanged.
    """

    return re.sub(r'[^\s\dA-Za-z\u00C0-\u00D6\u00D8-\u1EF9]', ' ', original_string)


def remove_redundant_white_spaces(original_string: str):
    """
    Collapse every run of whitespace into one space and drop leading/trailing whitespace.
    """

    # Splitting on whitespace discards empty pieces, so re-joining with a single
    # space both trims the ends and squeezes the gaps between words.
    return ' '.join(original_string.split())


def remove_duplicate_characters(original_string: str):
    """
    Squeeze each run of one repeated ASCII letter (compared case-insensitively)
    down to the first letter of the run. Digits and accented letters are left alone.
    """

    repeated_letter = re.compile(r'([A-Z])\1+', re.IGNORECASE)
    return repeated_letter.sub(lambda run: run.group(1), original_string)


def convert_to_unicode_string(original_string: str):
    """
    Convert accented characters to their precomposed Unicode form (NFC).

    A Vietnamese letter can be stored either as one precomposed code point or as a
    base letter followed by combining marks; the two look the same but are different
    code point sequences. NFC normalisation composes every such sequence, including
    fully decomposed ones, so later steps only ever see the precomposed characters.
    """

    # Local import: the notebook's import cell does not bring in unicodedata.
    import unicodedata

    return unicodedata.normalize('NFC', original_string)


def convert_to_lowercase(original_string: str):
    """
    Return a lowercase copy of the given string.
    """

    lowercased = original_string.lower()
    return lowercased


def is_valid_vietnamese_word(word: str):
    """
    Tell whether a word has the vowel layout of a Vietnamese word:
    at least one vowel, and all of its vowels sitting next to each other.
    """

    # Positions of every character known to the vowel lookup table;
    # consonants and unknown characters are simply skipped.
    vowel_positions = [
        position
        for position, letter in enumerate(word)
        if letter in accented_vowel_indexes_dict
    ]

    # No vowel at all => not a Vietnamese word
    if not vowel_positions:
        return False

    # The vowels form one unbroken run exactly when the distance between the first
    # and the last vowel equals the number of vowels minus one.
    covered_span = vowel_positions[-1] - vowel_positions[0]
    return covered_span == len(vowel_positions) - 1


def normalize_tone_mark_in_word(original_word: str):
    """
    Put the tone mark of a word on the correct vowel according to Vietnamese rules.

    Words rejected by is_valid_vietnamese_word (abbreviations, foreign words,
    misspelled words, ...) are returned unchanged. Otherwise the tone mark is removed
    from wherever it was typed and re-attached to:
      * the only vowel, when the word has a single vowel (the "i" of "gi" and the
        "u" of "qu" are treated as part of the consonant);
      * "ê" or "ơ" when the vowel cluster contains one of them ("thuế", "thuở");
      * the last vowel when the word ends with a consonant ("tiếng", "việt");
      * the second-to-last vowel otherwise ("hòa", "cái").

    The word is expected to be lowercase, because the vowel lookup table built from
    accented_vowels_table only holds lowercase vowels.

    Returns:
        str: the word with its tone mark in the normalized position.
    """

    # Ignore invalid Vietnamese words, including abbreviations, foreign languages, misspelled words, etc
    if not is_valid_vietnamese_word(original_word):
        return original_word

    characters = list(original_word)
    vowel_indexes = []
    tone_mark_index = 0  # column in accented_vowels_table; 0 means "no tone mark"
    default_indexes = (-1, -1)
    startswith_gi_or_qu_consonant = False

    # Remove the tone mark of the word while tracking the tone mark and the vowel positions
    for index, char in enumerate(characters):
        x, y = accented_vowel_indexes_dict.get(char, default_indexes)

        # Not a Vietnamese vowel (consonant or unknown character)
        if x == -1:
            continue

        # Special case "gi" (row 5 of the table is "i")
        if x == 5 and index != 0 and characters[index - 1] == 'g':
            characters[index] = 'i'
            startswith_gi_or_qu_consonant = True

        # Special case "qu" (row 9 of the table is "u")
        if x == 9 and index != 0 and characters[index - 1] == 'q':
            characters[index] = 'u'
            startswith_gi_or_qu_consonant = True

        # Remember the tone mark (if any) and strip it from the vowel
        if y != 0:
            tone_mark_index = y
            characters[index] = accented_vowels_table[x][0]

        # Track the indexes of all vowels,
        # ignoring the "i" and "u" of the "gi" and "qu" compound consonants
        if not startswith_gi_or_qu_consonant or index >= 2:
            vowel_indexes.append(index)

    # Handle less than 2 vowels
    # Eg: "gì", "giờ", "gìn", "tá", "tan", etc
    if len(vowel_indexes) < 2:
        if not startswith_gi_or_qu_consonant or len(characters) == 2:
            return original_word

        # Having a "gi" or "qu" compound consonant and length >= 3
        x, _ = accented_vowel_indexes_dict.get(characters[2], default_indexes)

        # No vowel after "gi"/"qu" => the compound itself carries the tone mark,
        # which is exactly the original word
        if x == -1:
            return original_word

        # Otherwise the vowel right after the compound carries the tone mark
        characters[2] = accented_vowels_table[x][tone_mark_index]
        return ''.join(characters)

    # Handle 2+ vowels.
    # "ê" (row 4) and "ơ" (row 8) always carry the tone mark when present; without this
    # rule "thuế" would be rewritten with the tone on "u" because it ends with a vowel.
    for position in vowel_indexes:
        x, _ = accented_vowel_indexes_dict[characters[position]]
        if x in (4, 8):
            characters[position] = accented_vowels_table[x][tone_mark_index]
            return ''.join(characters)

    x, _ = accented_vowel_indexes_dict.get(characters[-1], default_indexes)

    if x == -1:
        # Ends with a consonant => the last vowel carries the tone mark
        # Eg: "quyến", "ngoan", "toán", etc
        tone_mark_position = vowel_indexes[-1]
    else:
        # Ends with a vowel => the 2nd last vowel carries the tone mark
        # Eg: "quái", "quầy", "hòa", "cái", etc
        tone_mark_position = vowel_indexes[-2]

    # Tone marks were stripped above, so this character is a base vowel present in the table
    x, _ = accented_vowel_indexes_dict[characters[tone_mark_position]]
    characters[tone_mark_position] = accented_vowels_table[x][tone_mark_index]
    return ''.join(characters)


def normalize_tone_marks_in_sentence(original_sentence: str):
    """
    Normalize the tone mark position of every whitespace-separated word of a sentence
    and return the words joined back together with single spaces.
    """

    return ' '.join(
        normalize_tone_mark_in_word(token) for token in original_sentence.split()
    )


In [None]:

def convert_to_word_phrares_string(original_string: str):
    """
    Run the underthesea word tokenizer on the string and return its text-format
    result (in this notebook's outputs the syllables of one word are joined by "_").
    """

    segmented_text = word_tokenize(original_string, format='text')
    return segmented_text


# stopwords_df = pd.read_csv(stopwords_data_path, names=['word'])
# stopwords_list = stopwords_df['word'].tolist()


def remove_stopwords(original_string: str, stopwords=None):
    """
    Remove stopwords from a whitespace-separated string.

    Args:
        original_string: text whose words are separated by whitespace.
        stopwords: iterable of words to drop. When omitted, the notebook-level
            `stopwords_list` is used; that list must have been loaded beforehand
            (its loading code is currently commented out), otherwise a NameError
            is raised.

    Returns:
        str: the remaining words joined by single spaces.
    """

    words = original_string.split()
    if not words:
        return ''

    if stopwords is None:
        stopwords = stopwords_list

    # A set gives constant-time membership tests instead of scanning a list per word
    excluded_words = set(stopwords)

    return ' '.join(word for word in words if word not in excluded_words)


def preprocess_string(document: str):
    """
    Apply the full chain of text transformations to the original string and
    return a string suitable for training models.

    Order matters:
      * Unicode composition runs first so every later step sees precomposed letters
        (otherwise duplicate-letter collapsing could merge a base letter with the
        base of a following decomposed letter);
      * lowercasing runs before special-character removal so uppercase accented
        letters are never mistaken for special characters.
    """

    # Convert string to unicode standard before applying further transformations
    document = convert_to_unicode_string(document)

    # Convert string to lowercase
    document = convert_to_lowercase(document)

    # Remove emojis, special characters, commas, dots, etc
    document = remove_special_characters(document)

    # Remove redundant white spaces
    document = remove_redundant_white_spaces(document)

    # Remove consecutive duplicated characters
    document = remove_duplicate_characters(document)

    # Put tone marks in the correct positions
    document = normalize_tone_marks_in_sentence(document)

    # Convert to string with word phrases
    document = convert_to_word_phrares_string(document)

    # # Remove stopwords
    # document = remove_stopwords(document)

    return document


In [None]:
# Load the three raw CSV files from the DatasetRaw folder on the mounted Drive:
# the two "Labeling*" files and the "datagpt" file.
dfLy = pd.read_csv("/content/drive/MyDrive/ABSA/DatasetRaw/LabelingLy.csv")
dfQ = pd.read_csv("/content/drive/MyDrive/ABSA/DatasetRaw/LabelingQuynh.csv")
dfgpt = pd.read_csv("/content/drive/MyDrive/ABSA/DatasetRaw/datagpt.csv")

In [None]:
# Keep at most the first 2001 rows (positional slice).
dfQ = dfQ.iloc[:2001]

In [None]:
dfQ

Unnamed: 0,character,plot,scene,sound,Cmt,Unnamed: 5
0,P,O,P,P,Cũng tạm ổn khúc cuối hơi không hiểu cần làm r...,
1,O,P,P,P,Phim khá ổn so với thể loại phim kinh dị của V...,
2,P,P,P,P,Lần đầu làm chuyện ấy phim hay kinh dị nhưng k...,
3,P,P,P,N,"Thấy phim OK mà bị mọi người chê dữ, có bám th...",
4,P,N,N,N,"Phim lan man. góc quay siêu xấu luôn ấy, nhìn ...",
...,...,...,...,...,...,...
1996,X,N,P,P,"♾️ điểm nha mọi người, nội dung dễ đoán, tình ...",
1997,P,X,X,X,"Phim hay lắm nha, ông chú diễn quá xuất sắc. N...",
1998,X,X,X,X,Không biết nói gì thêmmm =))) quá tuyệt dzờiii,
1999,X,X,X,X,phim hay lắm,


In [None]:
# Give the three frames the same column names before concatenating them:
# character / content / scene / sound / Cmt.
# Note that the " Acting" key starts with a space; it has to match the CSV header exactly.
dfLy = dfLy.rename(columns={" Acting": "character", "Plot": "content", "Scene": "scene", "Sound": "sound"})
dfQ  = dfQ.rename(columns={"plot": "content"})
dfgpt  = dfgpt.rename(columns={"plot": "content", "cmt": "Cmt"})


In [None]:
dfgpt

Unnamed: 0,Cmt,character,content,scene,sound
0,"Âm thanh trong phim quá tệ, không rõ ràng và g...",X,X,X,N
1,"Hệ thống âm thanh không đồng bộ với hình ảnh, ...",X,X,X,N
2,"Nhạc nền quá ồn ào, che lấp hết các đoạn hội t...",X,X,X,N
3,"Hiệu ứng âm thanh thiếu chiều sâu, nghe rất gi...",X,X,X,N
4,"Âm thanh nền lặp đi lặp lại, không tạo được sự...",X,X,X,N
...,...,...,...,...,...
1094,Cảnh phim tuy không quá đặc sắc nhưng đủ để là...,X,X,O,X
1095,"Bối cảnh trong phim được thiết kế đơn giản, gi...",X,X,O,X
1096,"Cảnh phim được bố trí hợp lý, không làm mất đi...",X,X,O,X
1097,Bối cảnh trong phim không quá nổi bật nhưng kh...,X,X,O,X


In [None]:
dfLy.columns

Index(['character', 'content', 'scene', 'sound', 'Cmt'], dtype='object')

In [None]:
# Stack the three sources into one frame with a fresh 0..n-1 index, then drop the
# 'Unnamed: 5' column (shown as all-NaN in the dfQ preview above).
df = pd.concat([dfQ, dfLy, dfgpt], ignore_index=True)
df = df.drop(columns=['Unnamed: 5'])
df.columns

Index(['character', 'content', 'scene', 'sound', 'Cmt'], dtype='object')

In [None]:
df[df["character"].isnull()]

Unnamed: 0,character,content,scene,sound,Cmt
4488,,X,X,N,"Lồng tiếng không đều, giọng lúc to lúc nhỏ khô..."


In [None]:
# Fill the missing `character` label with 'X' in a single .loc assignment.
# (Chained `df[col].iloc[i] = ...` assignment triggers a pandas warning and stops
# updating `df` once Copy-on-Write is the default.)
df.loc[df["character"].isna(), "character"] = 'X'


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["character"].iloc[4488]='X'


In [None]:
# Rows in which at least one column holds a lowercase 'x'.
filtered_df = df[df.eq('x').any(axis=1)]
filtered_df

Unnamed: 0,character,content,scene,sound,Cmt
2111,O,x,x,x,Cảm thấy nhân vật nam ko có gì cuốn hút làm sự...
2278,x,x,x,x,Đi coi ngay 8/3 ta nói cười banh rạp :))) phim...


In [None]:
# Normalise the lowercase 'x' labels to 'X' in every label column at once.
# Replacing by value (instead of by hard-coded row number) leaves other labels, such as
# the 'O' in `character` of row 2111, untouched, and also fixes the 'x' in `character`
# of row 2278. It also avoids chained assignment, which stops working under pandas
# Copy-on-Write.
label_columns = ["character", "content", "scene", "sound"]
df[label_columns] = df[label_columns].replace('x', 'X')

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["character"].iloc[2111]='X'
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, 

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4762 entries, 0 to 4761
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   character  4762 non-null   object
 1   content    4762 non-null   object
 2   scene      4762 non-null   object
 3   sound      4762 non-null   object
 4   Cmt        4762 non-null   object
dtypes: object(5)
memory usage: 186.1+ KB


In [None]:
preprocess_string(df["Cmt"].iloc[1])

'phim khá ổn so với thể_loại phim kinh_dị của việt_nam bối_cảnh và trang_phục đẹp nhạc phim hay đây là thể_loại kinh_dị tâm_lý đánh vào nội_tâm con_người để khơi nguồn nỗi sợ mình thấy nhiều bạn bảo là chưa đủ đô kinh_dị và chê phim khá nặng_nề vậy chắc'

In [None]:
# Output folder for the fully processed dataset.
# NOTE: this rebinds `path`, shadowing the `path` module imported with `from os import path`.
path = "/content/drive/MyDrive/ABSA/DatasetRaw"

In [None]:
# Run the whole preprocessing chain on every comment and store the result in a new column.
df["processed_cmt"] = df["Cmt"].apply(preprocess_string)

In [None]:
# Save the full processed frame; "utf-8-sig" writes a BOM at the start of the file.
df.to_csv(f'{path}/Processed_full_data.csv', encoding="utf-8-sig", index = False)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4762 entries, 0 to 4761
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   character      4762 non-null   object
 1   content        4762 non-null   object
 2   scene          4762 non-null   object
 3   sound          4762 non-null   object
 4   Cmt            4762 non-null   object
 5   processed_cmt  4762 non-null   object
dtypes: object(6)
memory usage: 223.3+ KB


In [None]:
df.columns

Index(['character', 'content', 'scene', 'sound', 'Cmt', 'processed_cmt'], dtype='object')

In [None]:
# Keep only the columns whose name is not NaN (the column listing above shows six
# named columns, so this looks like a safety net — TODO confirm it is still needed).
df = df.loc[:, df.columns.notna()]


In [None]:
from sklearn.model_selection import train_test_split

# 80 % train, then the held-out 20 % is halved into dev and test (10 % each).
# A fixed random_state keeps the split reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=42)
dev, test = train_test_split(test, test_size=0.5, random_state=42)

In [None]:
# Output folder for the train/dev/test splits (rebinds `path` again).
path = "/content/drive/MyDrive/ABSA/DataProcessed"

In [None]:
# Write the three splits as CSV files without the index column.
train.to_csv(f'{path}/train_processed.csv', encoding="utf-8-sig", index = False)
dev.to_csv(f'{path}/dev_processed.csv', encoding="utf-8-sig", index = False)
test.to_csv(f'{path}/test_processed.csv', encoding="utf-8-sig", index = False)

In [None]:
train.info()
dev.info()
test.info()

Unnamed: 0,character,content,scene,sound,Cmt,processed_cmt
315,P,X,X,X,Phần 1 lẫn phần 2 phần nào cũng hấp dẫn diễn v...,phần 1 lẫn phần 2 phần_nào cũng hấp_dẫn diễn_v...
3126,P,X,X,X,"Phim hay, ổn, diễn viên đẹp, nói chung hợp gu ...",phim hay ổn diễn_viên đẹp nói_chung hợp_gu nhẹ...
3656,X,X,X,X,"Xuất sắc nha mn ơi, đủ kịch tính, coi mà thấy ...",xuất_sắc nha mn ơi đủ kịch tính coi mà thấy tứ...
3046,X,X,X,X,Phim hay lắm á hài hay kịch tính có đủ hết chỉ...,phim hay lắm á_hài hay kịch_tính có đủ hết chỉ...
2331,X,N,X,X,"đoạn đầu dài lê thê , không giải quyết vấn đề ...",đoạn đầu dài lê_thê không giải_quyết vấn_đề mì...
...,...,...,...,...,...,...
4309,O,X,X,X,Các diễn viên tuy không quá xuất sắc nhưng vẫn...,các diễn_viên tuy không quá xuất_sắc nhưng vẫn...
2577,X,X,X,X,Má ơi dở xĩu,má ơi dở_xĩu
12,X,O,X,X,"Vốn là người không dám xem phim kinh dị, nhưng...",vốn là người không dám xem phim kinh_dị nhưng ...
8,P,P,P,X,"Hic phim coi đúng kiểu dark horror, sợ ghê, cố...",hic phim coi đúng kiểu dark horor sợ ghê_cốt t...


In [None]:
# Re-read the saved splits from Drive; `path` now points at the project root.
# Note the naming: the file dev_processed.csv is loaded as `df_val`.
path = "/content/drive/MyDrive/ABSA"
df_train = pd.read_csv(path + "/DataProcessed/train_processed.csv")
df_test = pd.read_csv(path + "/DataProcessed/test_processed.csv")
df_val  = pd.read_csv(path + "/DataProcessed/dev_processed.csv")

In [None]:
import pickle

# Load the pickled aspect labels of the training split.
# The `with` block closes the file handle as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code — only load pickle files you created yourself.
with open(f'{path}/Label/label_aspect_train.pkl', 'rb') as label_file:
    label_aspect_train = pickle.load(label_file)


In [None]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3806 entries, 0 to 3808
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   character      3806 non-null   object
 1   content        3806 non-null   object
 2   scene          3806 non-null   object
 3   sound          3806 non-null   object
 4   Cmt            3806 non-null   object
 5   processed_cmt  3806 non-null   object
dtypes: object(6)
memory usage: 208.1+ KB


In [None]:
# df_train = df_train.dropna(how='any')
# Drop three label rows by index label.
# NOTE(review): the indices are hard-coded; presumably they are the rows removed from
# df_train by the dropna above (df_train.info() shows 3806 entries with index up to
# 3808) — confirm, and consider deriving them from df_train instead.
label_aspect_train= label_aspect_train.drop(index = [154, 2113, 3699])

In [None]:
# df_train.to_csv(path + "/DataProcessed/train_processed.csv")
# NOTE(review): `label_sentiment_train` is not created in any visible cell of this
# notebook (only `label_aspect_train` is loaded and filtered above), so this cell
# depends on hidden kernel state — confirm which object is meant to be saved here.
label_sentiment_train.to_pickle(path + "/Label/label_sentiment_train.pkl")

In [None]:
label_sentiment_train

Unnamed: 0,character,content,scene,sound
0,-1,0,-1,2
1,-1,-1,0,-1
2,-1,0,-1,-1
3,-1,-1,2,-1
4,-1,2,0,-1
...,...,...,...,...
3804,-1,-1,1,1
3805,-1,-1,-1,-1
3806,-1,-1,-1,-1
3807,-1,-1,-1,1
