## Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import re
import nltk
from nltk.tokenize import word_tokenize
import malaya



## Read csv file

In [2]:
print(pd.options.display.max_rows)
df = pd.read_csv("C:\\Users\\yiesi\\OneDrive\\Desktop\\assignment\\latest.csv")
print(df)

60
                         date  likesCount  \
0    2022-12-16T04:49:26.000Z         209   
1    2022-12-16T02:11:30.000Z         157   
2    2022-12-16T02:08:06.000Z         167   
3    2022-12-16T02:10:05.000Z          66   
4    2022-12-16T06:39:38.000Z          27   
..                        ...         ...   
651  2022-12-17T00:20:44.000Z           0   
652  2022-12-16T04:25:39.000Z           0   
653  2022-12-16T03:46:34.000Z           0   
654  2022-12-16T02:58:52.000Z           1   
655  2022-12-16T03:12:03.000Z           0   

                                       postDescription        postId  \
0    Kenyataan Media\n\nDikejutkan dengan insiden t...  7.300000e+14   
1    Kenyataan Media\n\nDikejutkan dengan insiden t...  7.300000e+14   
2    Kenyataan Media\n\nDikejutkan dengan insiden t...  7.300000e+14   
3    Kenyataan Media\n\nDikejutkan dengan insiden t...  7.300000e+14   
4    Kenyataan Media\n\nDikejutkan dengan insiden t...  7.300000e+14   
..                      

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656 entries, 0 to 655
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             656 non-null    object 
 1   likesCount       656 non-null    int64  
 2   postDescription  656 non-null    object 
 3   postId           656 non-null    float64
 4   postTitle        656 non-null    object 
 5   profileName      656 non-null    object 
 6   profilePicture   656 non-null    object 
 7   profileUrl       460 non-null    object 
 8   text             656 non-null    object 
dtypes: float64(1), int64(1), object(7)
memory usage: 46.2+ KB


In [4]:
df.isnull().sum()


date                 0
likesCount           0
postDescription      0
postId               0
postTitle            0
profileName          0
profilePicture       0
profileUrl         196
text                 0
dtype: int64

In [5]:
comment = df['text'] 
comment.dropna(inplace=True)

In [6]:
comment.isna().sum()

0

In [7]:
comment.describe

<bound method NDFrame.describe of 0      Ya, kadang-kadang kedatangan menteri-menteri b...
1      Terbaik  Dato sri PM10, menteri jgn sibuk2 nk ...
2                               Terbaik DSAI????????????
3      Terbaik DSAI, semoga kesemua yang terperangkap...
4      Assalamualaikum DSAI yg di sayangi ramai\n\nSe...
                             ...                        
651                           Takda lesen blh beoperasi?
652    Peringatan yang baik untuk menteri-menteri, se...
653    Siapa yg nak pi ganggu semua raayat klu bolih ...
654    Innalillahiwainnailaihiroiiun\nSalam Takziah u...
655    Kasian ini yng kena musibah longsor moga datok...
Name: text, Length: 656, dtype: object>

In [8]:
comment.dtypes

dtype('O')

## Text pre-processing

In [9]:
# lower case
def lower_text(text):
    text = text.lower()
    return text

# removing line breaks
def line_breaks(text):
    text = re.sub(r'\n', ' ', text)
    return text

# removing special characters
def remove_special_characters(text):
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern,' ',text)
    return text

In [10]:
comment = comment.apply(lower_text)
comment = comment.apply(line_breaks)
comment = comment.apply(remove_special_characters)

In [11]:
comment.head(20)

0     ya  kadang kadang kedatangan menteri menteri b...
1     terbaik  dato sri pm10  menteri jgn sibuk2 nk ...
2                              terbaik dsai            
3     terbaik dsai  semoga kesemua yang terperangkap...
4     assalamualaikum dsai yg di sayangi ramai  semo...
5     semoga misi mencari dipermudahkan  anak buah s...
6           tindakan yg pantas dari pm 10 terbaik dsai 
7     alhamdulillah  bila seorang pemimpin yg berjiw...
8     musim tengkujuh ni  kawasan camping mcm ni san...
9     innalillahiwainnailaihiroiiun salam takziah un...
10    sedikit pun aku tak rasa menyesal undi ph pada...
11    mohon pm dsai keluarkan pekeliling musim hujan...
12    anwar ibrahim pm yg sgt perihatin terbaik dsai...
13    marilah semua tak kira latarbelakang kepercaya...
14                         terbaik dsai bertindak laju 
15    semoga tuhan mempermudah kerja kerja pencahari...
16    laju pihak perdana menteri dan kabinet respond   
17    sebaiknya nasihatkan menteri2 berkaitan ha

In [12]:
type(comment)

pandas.core.series.Series

In [13]:
comment[40]

'sir have you visited the disaster site  we understand your update and can see your are working hard  perhaps pause your posts for a day to mourn for this horrible disaster  no offense sir  we respect you and admire your leadership however this is important '

## Translation

In [14]:
from googletrans import Translator
translator = Translator()


In [15]:
comment = [translator.translate(c, dest="ms") for c in comment]

In [16]:
comment = [comment[i].text for i in range(len(comment))]

In [17]:
comment = pd.Series(comment)

In [18]:
comment[40]

'tuan adakah anda melawat tapak bencana kami memahami kemas kini anda dan dapat melihat anda bekerja keras mungkin jeda siaran anda selama sehari untuk berkabung atas bencana yang mengerikan ini tiada kesalahan tuan kami menghormati anda dan mengagumi kepimpinan anda namun ini penting'

## Tokenization

In [19]:
nltk.word_tokenize(comment[0])

['ya',
 'kadang',
 'kadang',
 'kedatangan',
 'menteri',
 'menteri',
 'bukannya',
 'membantu',
 'malah',
 'hanya',
 'menyusahkan',
 'petugas',
 'dan',
 'mengganggu',
 'kerja',
 'yang',
 'sedang',
 'giat',
 'dilaksanakan',
 'kita',
 'juga',
 'tidak',
 'lupa',
 'beberapa',
 'orang',
 'menteri',
 'yang',
 'berlakon',
 'buat',
 'kerja',
 'masa',
 'banjir',
 'di',
 'taman',
 'sri',
 'muda',
 'dan',
 'hulu',
 'langat',
 'dulu']

In [20]:
tokens = comment.apply(word_tokenize)
print(tokens)

0      [ya, kadang, kadang, kedatangan, menteri, ment...
1      [terbaik, dato, sri, pm10, menteri, jgn, sibuk...
2                                        [terbaik, dsai]
3      [terbaik, dsai, semoga, kesemua, yang, terpera...
4      [assalamualaikum, dsai, yg, di, sayangi, ramai...
                             ...                        
651                       [takda, lesen, blh, beoperasi]
652    [peringatan, yang, baik, untuk, menteri, mente...
653    [siapa, yg, nak, pi, ganggu, semua, raayat, kl...
654    [innalillahiwainnailaihiroiiun, salam, takziah...
655    [kasian, ini, yng, kena, musibah, longsor, mog...
Length: 656, dtype: object


## Malay Lemmatization

In [21]:
sastrawi = malaya.stem.sastrawi()
texts = [" ".join(token) for token in tokens]

In [22]:
sentence = [sastrawi.stem(comments) for comments in texts]
sentence = pd.Series(sentence)
print(sentence)

0      ya kadang kadang datang menteri menteri bukan ...
1      baik dato sri pm10 menteri jgn sibuk2 nk minta...
2                                              baik dsai
3      baik dsai moga semua yang perangkap dapat selamat
4      assalamualaikum dsai yg di sayang ramai moga a...
                             ...                        
651                            takda lesen blh beoperasi
652    ingat yang baik untuk menteri menteri sebab be...
653    siapa yg nak pi ganggu semua raayat klu bolih ...
654    innalillahiwainnailaihiroiiun salam takziah un...
655    kasi ini yng kena musibah longsor moga datok a...
Length: 656, dtype: object


## Rojak Lemmatization

In [23]:
sentence_tok = sentence.apply(word_tokenize)
sentence_tok.head(20)

0     [ya, kadang, kadang, datang, menteri, menteri,...
1     [baik, dato, sri, pm10, menteri, jgn, sibuk2, ...
2                                          [baik, dsai]
3     [baik, dsai, moga, semua, yang, perangkap, dap...
4     [assalamualaikum, dsai, yg, di, sayang, ramai,...
5     [moga, misi, cari, mudah, anak, buah, saya, do...
6        [tindak, yg, pantas, dari, pm, 10, baik, dsai]
7     [alhamdulillah, bila, orang, pimpin, yg, jiwa,...
8     [musim, tengkujuh, ni, kawasan, camping, mcm, ...
9     [innalillahiwainnailaihiroiiun, salam, takziah...
10    [sedikit, pun, aku, tak, rasa, sesal, undi, ph...
11    [mohon, pm, dsai, keluar, keliling, musim, huj...
12    [anwar, ibrahim, pm, yg, sgt, perihatin, baik,...
13    [mari, semua, tak, kira, latarbelakang, percay...
14                           [baik, dsai, tindak, laju]
15     [moga, tuhan, mudah, kerja, kerja, cahari, amin]
16    [laju, pihak, perdana, menteri, dan, kabinet, ...
17    [baik, nasihat, menteri2, kait, hanya, mon

In [24]:
malaydict = {
    "2" : "itu",
    "6be" : "nombor",
    "abe" : "habis",
    "abes" : "habis",
    "abg" : "abang",
    "abih" : "habis",
    "ade" : "ada",
    "adek" : "adik",
    "adeq" : "adik",
    "ader" : "ada",
    "adk" : "adik",
    "adlh" : "adalah",
    "ado" : "ada",
    "agensi2" : "agensi-agensi",
    "agk" : "agak",
    "agr" : "agar",
    "aje" : "saja",
    "ajer" : "saja",
    "ak" : "aku",
    "ako" : "aku",
    "amat2" : "amat-amat",
    "ambik" : "ambil",
    "amek" : "ambil",
    "amt" : "amat",
    "ank" : "anak",
    "anggota2" : "anggota-anggota",
    "apa2": "apa-apa",
    "ape" : "apa",
    "aper" : "apa",
    "apehal" : "apa hal",
    "aq" : "aku",
    "arap" : "harap",
    "ari" : "hari",
    "aritu" : "hari itu",
    "ats" : "atas",
    "bagitau" : "bagitahu",
    "baik2" : "baik-baik",
    "baper" : "berapa",
    "bca" : "baca",
    "bleh" : "boleh",
    "blek" : "balik",
    "bende" : "benda",
    "berhrp" : "berharap",
    "bg" : "bagi",
    "bgi" : "bagi",
    "bgitu" : "begitu",
    "bgtau" : "bagitahu",
    "bior" : "biar",
    "biorlh" : "biarlah",
    "bkn" : "bukan",
    "blah" : "belah",
    "blh" : "boleh",
    "bljr" : "belajar",
    "blk" : "belakang",
    "blkng" : "belakang",
    "blker" : "belakang",
    "bls" : "balas",
    "bndingkn" : "bandingkan",
    "bnyak" : "bakyak",
    "biase" : "biasa",
    "bodo" : "bodoh",
    "bole" : "boleh",
    "bolih" : "boleh",
    "brader" : "abang",
    "brduka" : "berduka",
    "brjln" : "berjalan",
    "bjya" : "berjaya",
    "brlaku" : "berlaku",
    "brg" : "barang",
    "brp" : "berapa",
    "brulang" : "berulang",
    "btl" : "betul",
    "budak2" : "budak-budak",
    "bukit2" : "bukit-bukit",
    "bwh" : "bawah",
    "byk" : "banyak",
    "byk2" : "banyank-banyak",
    "byr" : "bayar",
    "camna" : "macam mana",
    "camne" : "macam mana",
    "camner" : "macam mana",
    "camni" : "macam ini",
    "camtu" : "macam itu",
    "cbuk" : "sibuk",
    "chat" : "sihat",
    "ckap" : "cakap",
    "ckp" : "cakap",
    "cm" : "macam",
    "cmana" : "macam mana",
    "cmane" : "macam mana",
    "cmburu" : "cemburu",
    "cmna" : "macam mana",
    "cmne" : "macam mana",
    "cmner" : "macam mana",
    "cmpur" : "campur",
    "cni" : "sini",
    "cpur" : "campur",
    "cpt" : "cepat",
    "c2" : "situ",
    "ctu" : "situ",
    "cukuplh" : "cukuplah",
    "d" : "di",
    "dea" : "dia",
    "dear" : "dia",
    "deorg" : "dia orang",
    "deorng" : "dia orang",
    "dgar" : "dengar",
    "dgn" : "dengan",
    "dgr" : "dengar",
    "dh" : "dah",
    "diaorg" : "dia orang",
    "diaorng" : "dia orang",
    "didlm" : "di dalam",
    "die" : "dia",
    "dier" : "dia",
    "dikluarkan" : "dikeluarkan",
    "diprmudahkn" : "dipermudahkan",
    "dk" : "dekat",
    "dkat" : "dekat",
    "dkt" : "dekat",
    "dl" : "dulu",
    "dlakukn" : "dilakukan",
    "dlm" : "dalam",
    "dlu" : "dulu",
    "dn" : "dan",
    "dngn" : "dengan",
    "doakn" : "doakan",
    "dok" : "duduk",
    "dorg" : "dia orang",
    "dpd" : "daripada",
    "doc" : "doktor",
    "dpmudahkn" : "dapat mudahkan",
    "dprmudhkn" : "dipermudahkan",
    "dpt" : "dapat",
    "dr" : "dari",
    "drnya" : "darinya",
    "drp" : "daripada",
    "ds" : "dato seri",
    "dsai" : "dato' seri anwar bin ibrahim",
    "dseri" : "dato seri",
    "dslmtkn" : "diselamatkan",
    "dtg" : "datang",
    "duk" : "duduk",
    "dun" : "dunia",
    "elkn" : "elokan",
    "eloklh" : "eloklah",
    "fhm" : "faham",
    "fhmx" : "faham tak",
    "fkr" : "fikir",
    "fmly" : "keluarga",
    "g" : "pergi",
    "gak" : "juga",
    "gi" : "pergi",
    "gile" : "gila",
    "giler" : "gila",
    "gini" : "begini",
    "gitu" : "begitu",
    "gna" : "guna",
    "gne" : "guna",
    "hado" : "ada",
    "hamba2" : "hamba-hamba",
    "hbgn" : "hubungan",
    "hepi" : "gembira",
    "hjg" : "hujung",
    "hjn" : "hujan",
    "hjn2" : "hujan-hujan",
    "hosp" : "hospital",
    "hrap" : "harap",
    "hrga" : "harga",
    "hri" : "hari",
    "hrp" : "harap",
    "hutang2" : "hutang-hutang",
    "hutg" : "hutang",
    "hutng" : "hutang",
    "igt" : "ingat",
    "jap" : "kejap",
    "jbt" : "jabatan",
    "je" : "saja",
    "jela" : "saja lah",
    "jelh" : "saja lah",
    "jer" : "saja",
    "jelah" : "saja lah",
    "jeles" : "cemburu",
    "jls" : "jelas",
    "jerlh" : "saja lah",
    "jg" : "juga",
    "jga" : "juga",
    "jgk" : "juga",
    "jgn" : "jangan",
    "jgnla" : "janganlah",
    "jgnlh" : "janganlah",
    "jnis" : "jenis",
    "jnji" : "janji",
    "jls" : "jelas",
    "jilak" : "jelak",
    "jln" : "jalan",
    "jls" : "jelas",
    "jugak" : "juga",
    "jwtn" : "jawatan",
    "k" : "ok",
    "kabe" : "kabar",
    "kaber" : "kabar",
    "kabo" : "kabar",
    "kabor" : "kabar",
    "kalo" : "kalau",
    "kat" : "dekat",
    "kawasan2" : "kawasan-kawasan",
    "kaya2" : "kaya-kaya",
    "kcik" : "kecik",
    "kcuali" : "kecuali",
    "kdg" : "kadang",
    "kdg2" : "kadang-kadang",
    "keja" : "kerja",
    "keje" : "kerja",
    "kejer" : "kerja",
    "kelrga" : "keluarga",
    "kene" : "kena",
    "kepd" : "kepada",
    "kerja2" : "karja-kerja",
    "kja" : "kerja",
    "kjaan" : "kerjaan",
    "kje" : "kerja",
    "kl" : "kalau",
    "klau" : "kalau",
    "klga" : "keluarga",
    "klrga" : "keluarga",
    "klu" : "kalau",
    "kluarga" : "keluarga",
    "kmi" : "kami",
    "kn" : "kan",
    "knape" : "kenapa",
    "knp" : "kenapa",
    "ko" : "kau",
    "komen2": "komen-komen",
    "korg" : "kamu orang",
    "korng" : "kamu orang",
    "kpd" : "kepada",
    "kpda" : "kepada",
    "kpde" : "kepada",
    "krja" : "krja",
    "ksh" : "kasih",
    "ksut" : "kasut",
    "kt" : "dekat",
    "ku" : "aku",
    "kuatkn" : "kuatkan",
    "kwn" : "kawan",
    "kwsan" : "kawasan",
    "kwsn" : "kawasan",
    "kwsn2" : "kawasan-kawasan",
    "la" : "lah",
    "laa" : "lah",
    "lg" : "lagi",
    "lgi" : "lagi",
    "lh" : "lah",
    "lma" : "lama",
    "lmbat" : "lambat",
    "lmbt" : "lambat",
    "lme" : "lama",
    "m" : "masih",
    "mangsa2" : "mangsa-mangsa",
    "memg" : "memang",
    "mcari" : "mencari",
    "mcm" : "macam",
    "mcmne" : "macam mana",
    "mcmni" : "macam ini",
    "mcmtu" : "macam itu",
    "mcmtue" : "macam itu",
    "menjengok2" : "menjengok-jengok",
    "menjlnkn" : "menjalankan",
    "menteri2" : "menteri-menteri",
    "menyelmt" : "menyelamat",
    "mggu" : "minggu",
    "mgsa" : "mangsa",
    "mkn" : "makan",
    "mksd" : "maksud",
    "mintak" : "minta",
    "mjlnkan" : "menjalankan",
    "mmang" : "memang",
    "mmg" : "memang",
    "mmpu" : "mampu",
    "mn" : "men",
    "mndtgkn" : "mendatangkan",
    "mne" : "mana",
    "mngsa" : "mangsa",
    "mngsa2" : "mangsa-mangsa",
    "mnjaga" : "menjaga",
    "mnta" : "minta",
    "mntalh" : "mintalah",
    "mnteri" : "menteri",
    "mntri" : "menteri",
    "mnyelamat" : "menyelamat",
    "mnyelamatkn" : "menyelamatkan",
    "moga" : "semoga",
    "mreka" : "mereka",
    "mrk" : "mereka",
    "ms" : "masa",
    "msa" : "masa",
    "mse" : "masa",
    "msh" : "masih",
    "msj" : "mesej",
    "mslh" : "masalah",
    "msti" : "mesti",
    "mu" : "kamu",
    "mudah2" : "mudah-mudah",
    "muge" : "semoga",
    "muke" : "muka",
    "musim2" : "musim-musim",
    "n": "dan",
    "nape" : "kenapa",
    "ne" : "mana",
    "ngan" : "dengan",
    "ngn" : "dengan",
    "nk" : "nak",
    "nmpak" : "nampak",
    "nmpk" : "nampak",
    "nnt" : "nanti",
    "nnti" : "nanti",
    "nntilh" : "nantilah",
    "ni" : "ini",
    "nie" : "ini",
    "nii" : "ini",
    "nti" : "nanti",
    "no" : "nombor",
    "nye" : "nya",
    "nyibuk" : "menyibuk",
    "idup" : "hidup",
    "olih" : "oleh",
    "org" : "orang",
    "org2" : "orang-orang",
    "pd" : "pada",
    "pda" : "pada",
    "pdhal" : "pada hal",
    "pe" : "apa",
    "perda" : "perdana",
    "pegi" : "pergi",
    "petugas2" : "petugas-petugas",
    "pg" : "pagi",
    "pgi" : "pagi",
    "pggl" : "panggil",
    "ph" : "pakatan harapan",
    "pk" : "fikir",
    "pkai" : "pakai",
    "pm" : "perdana menteri",
    "pi" : "pergi",
    "pihk" : "pihak",
    "pjg" : "panjang",
    "plan2" : "pelan-pelan",
    "plg" : "paling",
    "pmx" : "perdana menteri ke-10",
    "pm10" : "perdana menteri ke-10",
    "pn" : "pun",
    "pndng" : "pandang",
    "pndngn" : "pandangan",
    "pnjg" : "panjang",
    "pokok2" : "pokok-pokok",
    "polisi2" : "polisi-polisi",
    "pon" : "pun",
    "pra" : "professional footballers association of malaysia",
    "prmudoh" : "permudah",
    "pntau" : "pantau",
    "prnah" : "pernah",
    "prnh" : "pernah",
    "projek2" : "projek-projek",
    "pru15" : "pilihan raya umum malaysia ke-15",
    "pulak" : "pula",
    "pusat2" : "pusat-pusat",
    "pstikn" : "pastikan",
    "pstu" : "pastu",
    "ptg" : "petang",
    "ptugas" : "petugas",
    "pyh" : "payah",
    "ramai2" : "ramai-ramai",
    "retus" : "artis",
    "rkn" : "rakan",
    "rkyat" : "rakyat",
    "rkyt" : "rakyat",
    "rmai" : "ramai",
    "rmh" : "rumah",
    "rs" : "rasa",
    "rsa" : "rasa",
    "rse" : "rasa",
    "sado" : "besar",
    "salm" : "salam",
    "sama2" : "sama-sama",
    "sape" : "siapa",
    "sbr" : "sabar",
    "sdr" : "sedar",
    "sebok" : "sebuk",
    "sekurang2nya" : "sekurang-kurangnya",
    "selmt" : "selamat",
    "sbb" : "sebab",
    "sblum" : "sebelum",
    "sdg" : "sedang",
    "sebaik2nya" : "sebaik-baiknya",
    "semuga" : "semoga",
    "sepnjng" : "sepanjang",
    "sgala" : "segala",
    "sgla" : "segala",
    "sgt" : "sangat",
    "sgt2" : "sangat-sangat",
    "shbt" : "sahabat",
    "shj" : "sahaja",
    "skg" : "sekarang",
    "skolah" : "sekolah",
    "skong" : "sokong",
    "skrg" : "sekarang",
    "siap2" : "siap-siap",
    "sibuk2" : "sibuk-sibuk",
    "slamat" : "selamat",
    "slh" : "salah",
    "slm" : "salam",
    "smg" : "semoga",
    "smoga" : "semoga",
    "smpai" : "sampai",
    "smpi" : "sampai",
    "smua" : "semua",
    "sna" : "sana",
    "sngt" : "sangat",
    "sni" : "sini",
    "sorg" : "seorang",
    "spital" : "hospital",
    "spt" : "seperti",
    "stiap" : "setiap",
    "stu" : "situ",
    "swt" : "subhanahu wa ta'ala",
    "sy" : "saya",
    "sya" : "saya",
    "syng" : "sayang",
    "t" : "nanti",
    "takda" : "tak ada",
    "takde" : "tak ada",
    "taknk" : "tak nak",
    "tanah2" : "tanah-tanah",
    "tau" : "tahu",
    "tautk" : "tahu tak",
    "taux" : "tahu tak",
    "taw" : "tau",
    "tawu" : "tau",
    "tbaik" : "terbaik",
    "td" : "tadi",
    "tdk" : "tidak",
    "tdo" : "tidur",
    "tersyg" : "tersayang",
    "tggi" : "tinggi",
    "tgk" : "tengok",
    "thn" : "tahun",
    "tiap" : "setiap",
    "tido" : "tidur",
    "tk" : "tak",
    "tknk" : "tak nak",
    "tksh" : "terima kasih",
    "tksih" : "terima kasih",
    "tkut" : "takut",
    "tlibat" : "terlibat",
    "tmpt" : "tempat",
    "tmpt2" : "tempat-tempat",
    "tnda" : "tanda",
    "tnda2" : "tanda-tanda",
    "tnh" : "tanah",
    "tntu" : "tentu",
    "ttg" : "tentang",
    "ttu" : "tentu",
    "tu" : "itu",
    "tugas2" : "tugas-tugas",
    "tuk" : "untuk",
    "tuu" : "itu",
    "tpi" : "tapi",
    "tpt" : "tetapi",
    "tq" : "terima kasih",
    "tr" : "ter",
    "trbaik" : "terbaik",
    "trhdp" : "terhadap",
    "trlibt" : "terlibat",
    "trlps" : "terlepas",
    "trm" : "terima",
    "trma" : "terima",
    "trus" : "terus",
    "udah" : "sudah",
    "ujan" : "hujan",
    "urusn" : "urusan",
    "utk" : "untuk",
    "vip2" : "vip-vip",
    "waris2" : "waris-waris",
    "wlpn" : "walaupun",
    "wlpun" : "walaupun",
    "wpun" : "walaupun",
    "wslm" : "wa’alaikumsalam",
    "x" : "tak",
    "xberapa" : "tak berapa",
    "xberbau" : "tak berbau",
    "xbrp" : "tak berapa",
    "xkn" : "tak kan",
    "xkan" : "tak kan",
    "xle" : "tak boleh",
    "xleh" : "tak boleh",
    "xmau" : "tak mahu",
    "xmenentu" : "tak menentu",
    "xnk" : "tak nak",
    "xperlu" : "tak perlu",
    "xprlu" : "tak perlu",
    "xya" : "tak payah",
    "xyah" : "tak payah",
    "y" : "yang",
    "yab" : "yang amat berhormat",
    "yg" : "yang",
    "yng" : "yang",
    "yra" : "ya rabbal alamin"

}


In [25]:
for s in sentence_tok:
    for i in range(len(s)):
        for key, value in malaydict.items():
            if s[i] == key:
                s[i] = value
            else:
                s[i] = s[i]

print(sentence_tok)

0      [ya, kadang, kadang, datang, menteri, menteri,...
1      [baik, dato, sri, perdana menteri ke-10, mente...
2                   [baik, dato' seri anwar bin ibrahim]
3      [baik, dato' seri anwar bin ibrahim, semoga, s...
4      [assalamualaikum, dato' seri anwar bin ibrahim...
                             ...                        
651                   [tak ada, lesen, boleh, beoperasi]
652    [ingat, yang, baik, untuk, menteri, menteri, s...
653    [siapa, yang, nak, pergi, ganggu, semua, raaya...
654    [innalillahiwainnailaihiroiiun, salam, takziah...
655    [kasi, ini, yang, kena, musibah, longsor, semo...
Length: 656, dtype: object


In [26]:
lem_sen = [" ".join(token) for token in sentence_tok]
lem_sen = pd.Series(lem_sen)

In [27]:
print(lem_sen)

0      ya kadang kadang datang menteri menteri bukan ...
1      baik dato sri perdana menteri ke-10 menteri ja...
2                      baik dato' seri anwar bin ibrahim
3      baik dato' seri anwar bin ibrahim semoga semua...
4      assalamualaikum dato' seri anwar bin ibrahim y...
                             ...                        
651                        tak ada lesen boleh beoperasi
652    ingat yang baik untuk menteri menteri sebab be...
653    siapa yang nak pergi ganggu semua raayat kalau...
654    innalillahiwainnailaihiroiiun salam takziah un...
655    kasi ini yang kena musibah longsor semoga dato...
Length: 656, dtype: object


## Application / User Input

In [29]:
user_sen = input("Enter a sentence to lemmatize it: ")

user_token = nltk.word_tokenize(user_sen)
user_sen = [translator.translate(t, dest="ms") for t in user_token]
translated = [user_sen[i].text for i in range(len(user_sen))]
translated = " ".join(translated) 
user_token = nltk.word_tokenize(translated)

for i in range(len(user_token)):
    for key, value in malaydict.items():
        if user_token[i] == key:
            user_token[i] = value

user_sen = " ".join(user_token)            
print(user_sen)


Enter a sentence to lemmatize it: 哇塞，kamera ni mmg best
wow , kamera ini memang terbaik
