In [35]:
import json

# Accumulators for the parsed records of each split; each name gets its own list.
train_data, test_data, unlabeled_data = [], [], []

def read_json(data, json_file):
    """Append every JSON record found in a JSON Lines file to ``data``.

    Args:
        data: Object with an ``append`` method (a list in this notebook);
            parsed records are added to it in place.
        json_file: Path of the file to read, one JSON document per line.

    Returns:
        None. Results are delivered by mutating ``data``.

    Blank or whitespace-only lines are skipped. Lines that are not valid JSON
    are reported on stdout and skipped, so one bad record does not abort the
    whole load.
    """
    # Without an explicit encoding, open() uses the platform locale codec
    # (e.g. cp1252 on Windows), which can fail on or garble non-ASCII text.
    # JSON text is UTF-8, so decode it as such.
    with open(json_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()  # drop the newline and surrounding whitespace
            if line:  # nothing to parse on an empty line
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")
                    
# Fill each accumulator from its JSON Lines file, in the same order as before.
for target, path in (
    (train_data, 'train_imdb.jsonl'),
    (test_data, 'test_imdb.jsonl'),
    (unlabeled_data, 'aug_imdb_unlabeled.jsonl'),
):
    read_json(target, path)



In [36]:
import pandas as pd

# Turn each collection of parsed records into a DataFrame, rebinding the same
# names so later cells keep working.
train_data, test_data, unlabeled_data = (
    pd.DataFrame(records)
    for records in (train_data, test_data, unlabeled_data)
)


In [37]:
train_data.head(5)

Unnamed: 0,text,label,embedding
0,fairly good romantic comedy in which i don't t...,1,"[-0.0167805497, -0.0395836979, 0.1233159453, -..."
1,"""dressed to kill"", is one of the best thriller...",1,"[-0.1252697259, 0.1014768854, 0.1718291789, -0..."
2,i'm glad that users (as of this date) who like...,1,"[0.1312361956, 0.0294876788, 0.2328549027, -0...."
3,needed an excuse to get out of the house while...,0,"[0.1387384981, 0.0460377187, 0.3447172046, -0...."
4,john candy's performance in once upon a crime ...,1,"[0.1606466323, -0.1768193543, 0.3563380837, -0..."


In [38]:
test_data.head()

Unnamed: 0,text,label,embedding
0,the 60s (1999) d: mark piznarski. josh hamilto...,0,"[-0.2179879397, -0.1741176099, 0.0884851664, -..."
1,hello. this movie is.......well.......okay. ju...,1,"[-0.0783471093, -0.279764235, 0.6189775467, 0...."
2,eyeliner was worn nearly 6000 years ago in egy...,1,"[0.03139963, -0.1652034372, 0.1265712678, -0.0..."
3,"this has to be, by far, the absolute worst mov...",0,"[-0.0552324504, -0.1593759954, 0.0467776954, -..."
4,"i like silent films, but this was a little too...",0,"[0.0934860557, 0.0262434836, 0.0843501985, -0...."


In [39]:
unlabeled_data.head()

Unnamed: 0,text,embedding
0,there is no relation at all between fortier an...,"[-0.097577557, -0.1536363065, 0.311417222, 0.0..."
1,in the process of trying to establish the audi...,"[-0.0003366936, 0.0877778083, -0.0071643554, 0..."
2,i give this movie 7 out of 10 because the vill...,"[-0.275570631, -0.3291363716, 0.079317905, 0.0..."
3,this is the best sci-fi that i have seen in my...,"[0.1461943835, -0.2785910368, 0.4456491172, -0..."
4,what an appalling piece of rubbish!!! who are ...,"[0.1696606129, 0.354041934, 0.4451519549, -0.0..."


In [40]:
import re
import nltk
# Fetch the NLTK resources clean_text relies on: the tokenizer models for
# nltk.word_tokenize, the stop-word list, and the WordNet corpus that
# WordNetLemmatizer looks words up in (without it, lemmatize() raises
# LookupError on a machine that has never downloaded it).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Translation table that deletes every ASCII punctuation character; it does not
# depend on the input, so it is built once instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-filled holder for the stop-word set and lemmatizer (see below).
_clean_text_cache = {}


def _clean_text_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _clean_text_cache:
        _clean_text_cache['stop_words'] = frozenset(stopwords.words('english'))
        _clean_text_cache['lemmatizer'] = WordNetLemmatizer()
    return _clean_text_cache['stop_words'], _clean_text_cache['lemmatizer']


def clean_text(text):
    """Tokenize and normalize one piece of text.

    Steps, in order:
      1. Remove ``<...>`` spans (non-greedy, within a single line).
      2. Tokenize with ``nltk.word_tokenize``.
      3. Lowercase each token and delete ASCII punctuation characters.
      4. Drop empty tokens and English stop words.
      5. Lemmatize the remaining tokens with ``WordNetLemmatizer``.

    Args:
        text: The string to clean.

    Returns:
        A list of cleaned token strings; never contains ``''``.
    """
    stop_words, lemmatizer = _clean_text_resources()

    text = re.sub(r'<.*?>', '', text)
    tokens = [
        token.lower().translate(_PUNCT_TABLE)
        for token in nltk.word_tokenize(text)
    ]

    # A token made only of punctuation (",", "...", quote marks) is reduced to
    # '' by the translate step; '' is not a stop word, so it has to be filtered
    # explicitly or it ends up in the result.
    tokens = [token for token in tokens if token and token not in stop_words]

    return [lemmatizer.lemmatize(token) for token in tokens]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ali\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
# Replace the raw review text in every frame with its cleaned token list.
# This overwrites the 'text' column, so the cell expects string input and is
# not meant to be re-run on already-tokenized frames.
# NOTE(review): the saved outputs that follow show stemmed tokens (e.g.
# 'fairli', 'romant', 'comedi'), which a WordNet lemmatizer would not
# produce -- confirm they were generated with the current clean_text.
for frame in (train_data, test_data, unlabeled_data):
    frame['text'] = frame['text'].apply(clean_text)

In [42]:
train_data.head()

Unnamed: 0,text,label,embedding
0,"[fairli, good, romant, comedi, nt, think, ever...",1,"[-0.0167805497, -0.0395836979, 0.1233159453, -..."
1,"[dress, kill, one, best, thriller, ever, made,...",1,"[-0.1252697259, 0.1014768854, 0.1718291789, -0..."
2,"[glad, user, date, like, movi, come, forward, ...",1,"[0.1312361956, 0.0294876788, 0.2328549027, -0...."
3,"[need, excus, get, hous, paint, dri, left, mov...",0,"[0.1387384981, 0.0460377187, 0.3447172046, -0...."
4,"[john, candi, perform, upon, crime, possibl, b...",1,"[0.1606466323, -0.1768193543, 0.3563380837, -0..."


In [43]:
test_data.head()

Unnamed: 0,text,label,embedding
0,"[60, 1999, mark, piznarski, josh, hamilton, ju...",0,"[-0.2179879397, -0.1741176099, 0.0884851664, -..."
1,"[hello, movi, well, okay, kid, awesom, block, ...",1,"[-0.0783471093, -0.279764235, 0.6189775467, 0...."
2,"[eyelin, worn, nearli, 6000, year, ago, egypt,...",1,"[0.03139963, -0.1652034372, 0.1265712678, -0.0..."
3,"[far, absolut, worst, movi, seen, last, 20, ye...",0,"[-0.0552324504, -0.1593759954, 0.0467776954, -..."
4,"[like, silent, film, littl, moron, much, wish,...",0,"[0.0934860557, 0.0262434836, 0.0843501985, -0...."


In [44]:
unlabeled_data.head()

Unnamed: 0,text,embedding
0,"[relat, fortier, profil, fact, polic, seri, vi...","[-0.097577557, -0.1536363065, 0.311417222, 0.0..."
1,"[process, tri, establish, audienc, empathi, ja...","[-0.0003366936, 0.0877778083, -0.0071643554, 0..."
2,"[give, movi, 7, 10, villain, interest, role, u...","[-0.275570631, -0.3291363716, 0.079317905, 0.0..."
3,"[best, scifi, seen, 29, year, watch, scifi, al...","[0.1461943835, -0.2785910368, 0.4456491172, -0..."
4,"[appal, piec, rubbish, peopl, blubber, good, y...","[0.1696606129, 0.354041934, 0.4451519549, -0.0..."
