In [1]:
pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.5/1.5 MB 3.4 MB/s eta 0:00:00
Collecting regex>=2021.8.3
  Downloading regex-2024.4.16-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (777 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 777.0/777.0 kB 4.0 MB/s eta 0:00:00
Collecting tqdm
  Downloading tqdm-4.66.2-py3-none-any.whl (78 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.3/78.3 kB 4.5 MB/s eta 0:00:00
Collecting click
  Downloading click-8.1.7-py3-none-any.whl (97 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 97.9/97.9 kB 3.1 MB/s eta 0:00:00
Installing collected packages: tqdm, regex, click, nltk
Successfully installed click-8.1.7 nltk-3.8.1 regex-2024.4.16 tqdm-4.66.2

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK data packages this notebook needs: the 'punkt' tokenizer
# models and the 'wordnet' corpus (presumably what word_tokenize and
# WordNetLemmatizer load at runtime). Each call returns True on success.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Read the two CSV splits from the local data/ directory into DataFrames.
train_csv, test_csv = "data/train.csv", "data/test.csv"
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

In [4]:
# Preview the first ten training rows (ID, TEXT, LABEL).
train_df.head(10)

Unnamed: 0,ID,TEXT,LABEL
0,7850790573542594519,If you love good films don't ever buy this pei...,2
1,9392069522632994700,The 33 percent of the nations nitwits that sti...,2
2,5083704536542443514,I saw Anatomy years ago -- dubbed at a friends...,1
3,12418349755186772171,Dark Remains is a home run plain and simple. T...,1
4,12144957944004619479,Feh. This movie started out in an interesting ...,2
5,149196648365871032,I paid something like 89 cents for this book t...,0
6,795160300776864438,Series 2 has got off to a great start! I don't...,1
7,1683320143918265581,I admit I bought this book and a few other off...,0
8,16870111134185498931,My son loves Star Wars...I do not...but I do l...,1
9,4994537575152965727,This noir may not be the best remembered film ...,1


In [5]:
# (rows, columns) of the test split.
test_df.shape

(17580, 2)

In [6]:
# (rows, columns) of the training split.
train_df.shape

(70317, 3)

In [7]:
# Preview the first five test rows (ID, TEXT; no LABEL column yet).
test_df.head()

Unnamed: 0,ID,TEXT
0,4728459160322025755,An excellent debut movie for the the director ...
1,1840432070229003467,If you have a preschooler or remember how stre...
2,12623336783082722606,What should have been a routine babysitting gi...
3,7446733850828603409,Cute but
4,16180660281866613068,"Elvis Presley plays a ""half-breed"" Native Amer..."


In [8]:
# Number of missing values in each training column.
train_df.isna().sum()

ID       0
TEXT     6
LABEL    0
dtype: int64

In [9]:
# Count the missing cells in every training row, then keep the index labels
# of the rows that have at least one.
train_empty_values_per_row = train_df.isna().sum(axis=1)
rows_with_gaps = train_empty_values_per_row.loc[train_empty_values_per_row.gt(0)]
empty_rows = rows_with_gaps.index
print("Rows with empty values:")
print(empty_rows)

Rows with empty values:
Int64Index([1948, 12630, 20910, 48295, 56333, 67550], dtype='int64')


In [10]:
# Number of missing values in each test column.
test_df.isna().sum()

ID      0
TEXT    1
dtype: int64

In [11]:
# Show the training rows that contain a missing value.
# The labels are derived from the data rather than hard-coded, so the cell
# stays correct if the CSV changes, and rows are selected with .loc because
# these are index labels, not positions.
empty_row_indices = train_df.index[train_df.isna().any(axis=1)].tolist()
empty_rows = train_df.loc[empty_row_indices]
print(empty_rows)

                         ID TEXT  LABEL
1948   13508688890785489498  NaN      0
12630  17294071000633575886  NaN      0
20910  13709585650072749007  NaN      0
48295  15956250804379203815  NaN      0
56333    856590162040554499  NaN      0
67550  14966087743691161674  NaN      0


In [12]:
# Replace missing review text with a placeholder so the tokenizer and
# vectorizer never receive NaN. The result is assigned back to the column:
# calling fillna(inplace=True) on a column selection is chained assignment,
# which pandas deprecates and which leaves the frame unchanged under
# Copy-on-Write.
train_df['TEXT'] = train_df['TEXT'].fillna('no review yet')

In [13]:
# Re-check the per-column missing counts after filling TEXT.
train_df.isna().sum()

ID       0
TEXT     0
LABEL    0
dtype: int64

In [14]:
# Spot-check row 1948, one of the rows previously listed with a missing TEXT.
train_df.iloc[1948]

ID       13508688890785489498
TEXT            no review yet
LABEL                       0
Name: 1948, dtype: object

In [15]:
# Spot-check row 12630, another row previously listed with a missing TEXT.
train_df.iloc[12630]

ID       17294071000633575886
TEXT            no review yet
LABEL                       0
Name: 12630, dtype: object

In [16]:
# Same placeholder fill for the test split. Assign the result back instead of
# using fillna(inplace=True) on a column selection, which is deprecated
# chained assignment and does not modify the frame under Copy-on-Write.
test_df['TEXT'] = test_df['TEXT'].fillna('no review yet')

In [17]:
# Re-check the per-column missing counts in the test split after filling TEXT.
test_df.isna().sum()

ID      0
TEXT    0
dtype: int64

In [18]:
# Human-readable description for each numeric class id found in LABEL.
# The position of each description in the list is its class id (0, 1, 2).
label_mapping = dict(enumerate([
    'Not a movie or TV show review',
    'A positive movie or TV show review',
    'A negative movie or TV show review',
]))
label_descriptions = train_df['LABEL'].map(label_mapping)
train_df['Label_desc'] = label_descriptions

In [19]:
# Check that each numeric LABEL got the expected description.
train_df[['LABEL', 'Label_desc']].head(10)

Unnamed: 0,LABEL,Label_desc
0,2,A negative movie or TV show review
1,2,A negative movie or TV show review
2,1,A positive movie or TV show review
3,1,A positive movie or TV show review
4,2,A negative movie or TV show review
5,0,Not a movie or TV show review
6,1,A positive movie or TV show review
7,0,Not a movie or TV show review
8,1,A positive movie or TV show review
9,1,A positive movie or TV show review


In [20]:
def lemmatize_text(text):
    """Tokenize ``text`` and return its lemmatized tokens joined by spaces.

    Args:
        text: The string to process.

    Returns:
        A single string in which every token produced by ``word_tokenize``
        has been passed through ``WordNetLemmatizer.lemmatize`` and the
        results are separated by one space each.
    """
    wnl = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(wnl.lemmatize(token))
    return ' '.join(lemmas)

In [21]:
# Lemmatize the review text of both splits, overwriting TEXT in each frame.
for frame in (train_df, test_df):
    frame['TEXT'] = frame['TEXT'].apply(lemmatize_text)

In [22]:
# Word -> sentiment weight used by the lexicon-based sentiment score.
sentiment_lexicon = dict(like=2, awesome=3)

In [23]:
def calculate_sentiment_score(text, lexicon=None):
    """Sum lexicon weights over the words of ``text``, flipping negated hits.

    The text is lower-cased and split on whitespace. Every word found in the
    lexicon adds its weight to the score; if a negation word has been seen
    since the previous lexicon hit, the weight is subtracted instead and the
    pending negation is cleared.

    Args:
        text: Review text. Non-string input (e.g. ``None`` or ``NaN``)
            scores 0 instead of raising.
        lexicon: Optional mapping of lower-case word -> numeric weight.
            Defaults to the notebook-level ``sentiment_lexicon``.

    Returns:
        The accumulated score; 0 when no lexicon word occurs.

    Note:
        A negation stays pending until the next lexicon word, however far
        away that word is, so "not sure but I like it" scores ``like``
        negatively.
    """
    if not isinstance(text, str):
        return 0
    if lexicon is None:
        lexicon = sentiment_lexicon

    # word_tokenize (applied to TEXT upstream) splits contractions such as
    # "don't" into "do" + "n't", so "n't" must count as a negation too;
    # matching only "not" silently misses every contracted negation.
    negation_words = ('not', "n't")

    sentiment_score = 0
    negation = False
    for word in text.lower().split():
        if word in negation_words:
            negation = True
        elif word in lexicon:
            weight = lexicon[word]
            sentiment_score += -weight if negation else weight
            negation = False
    return sentiment_score

# Attach the lexicon-based score to both splits as a Sentiment_Score column.
# NOTE(review): no later cell visible here reads this column — confirm whether
# it was meant to feed the model.
for frame in (train_df, test_df):
    frame['Sentiment_Score'] = frame['TEXT'].apply(calculate_sentiment_score)

In [24]:
# Targets are the numeric class ids from the training frame.
y_train = train_df['LABEL']

# Unigram + bigram TF-IDF features. The vectorizer is fitted on the training
# text only and then reused, unchanged, to transform the test text.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
train_corpus, test_corpus = train_df['TEXT'], test_df['TEXT']
X_train = tfidf_vectorizer.fit_transform(train_corpus)
X_test = tfidf_vectorizer.transform(test_corpus)

In [25]:
# Linear SVM on the TF-IDF features. random_state is fixed because the
# default solver shuffles the data with a pseudo-random generator; without a
# seed, repeated runs can yield slightly different models and predictions.
model = LinearSVC(random_state=42)
model.fit(X_train, y_train)

LinearSVC()

In [26]:
# Predict a class id for every test row and store it on the test frame.
test_pred = model.predict(X_test)
test_df['LABEL'] = test_pred

In [27]:
# Build the two-column (ID, LABEL) table of test predictions and write it to
# submission.csv without the index column.
submission_df = test_df[['ID']].assign(LABEL=test_pred)
submission_df.to_csv('submission.csv', index=False)