In [None]:
# !pip install langdetect

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split


In [2]:
# Reading data
def read_csv(file_path):
    text_data = pd.read_csv(file_path, sep=",")
    return text_data

train_path = "./data/train.csv"
test_path = "./data/test.csv"

train_data = read_csv(train_path)
test_data = read_csv(test_path)

In [3]:
print(train_data)

                         ID  \
0       7850790573542594519   
1       9392069522632994700   
2       5083704536542443514   
3      12418349755186772171   
4      12144957944004619479   
...                     ...   
70312   4533411613007495120   
70313   2075555016956547354   
70314   5446360756235190232   
70315  11502529898454172361   
70316  11352924827579021872   

                                                    TEXT  LABEL  
0      If you love good films don't ever buy this pei...      2  
1      The 33 percent of the nations nitwits that sti...      2  
2      I saw Anatomy years ago -- dubbed at a friends...      1  
3      Dark Remains is a home run plain and simple. T...      1  
4      Feh. This movie started out in an interesting ...      2  
...                                                  ...    ...  
70312  I grew up looking for fairies in my backyard, ...      0  
70313  Jeff King,a guy I know from work,LOVES this Mo...      2  
70314  I love the way Inder write

In [4]:
print(test_data)

                         ID                                               TEXT
0       4728459160322025755  An excellent debut movie for the the director ...
1       1840432070229003467  If you have a preschooler or remember how stre...
2      12623336783082722606  What should have been a routine babysitting gi...
3       7446733850828603409                                           Cute but
4      16180660281866613068  Elvis Presley plays a "half-breed" Native Amer...
...                     ...                                                ...
17575  15460118162570972562  I hardly ever write reviews here, but when thi...
17576   2679547768967862209  　　\n\n三藏聞言，頂禮不盡。教：「徒弟們，收拾去罷。」那沙僧即在裏面尋了些米 糧，安排了...
17577   2966026531113989116  Another winner by Faith Hunter....It's a wild,...
17578  10698695044532313190  My students of all ages can't get enough of ac...
17579   8504104014180128164  ...On stage, TV or in a book, 'The Woman in Bl...

[17580 rows x 2 columns]


In [5]:
# Remove rows with NaN values in the 'TEXT' column
train_data = train_data.dropna(subset=['TEXT'])
# test_data = test_data['TEXT'].fillna('')

In [6]:
# Print general information about the DataFrame
print(test_data.info())  

# Print the data types of all columns
print(test_data.dtypes) 

# # Print unique values in the 'LABEL' column
# print(test_data['LABEL'].unique()) 

# Check for missing values
print(test_data.isna().sum()) 

<class 'pandas.core.series.Series'>
RangeIndex: 17580 entries, 0 to 17579
Series name: TEXT
Non-Null Count  Dtype 
--------------  ----- 
17580 non-null  object
dtypes: object(1)
memory usage: 137.5+ KB
None
object
0


In [None]:
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(train_data['TEXT'])
y_train = train_data['LABEL']
X_test = vectorizer.transform(test_data['TEXT'])

In [None]:
# Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
# Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# Evaluate the model on the validation set
y_pred_val = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val)
val_f1 = f1_score(y_val, y_pred_val, average='macro')
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation F1-Score: {val_f1:.4f}")

In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [None]:
submission_df = pd.DataFrame({'ID': test_data['ID'], 'LABEL': y_pred})

In [None]:
# # Check if the submission dataframe has exactly 17580 rows
# if len(submission_df) != 17580:
#     print("Error: The submission dataframe does not have exactly 17580 rows.")
# else:
#     # Save the submission dataframe as a CSV file
#     submission_df.to_csv('./data/submission/submission.csv', index=False)
#     print("Submission file saved successfully.")
# Create submission file without 'LABEL' column (if submission is still expected)
submission_df = pd.DataFrame({'ID': test_data['ID'], 'LABEL': y_pred})
submission_df.to_csv('./data/submission/submission.csv', index=False)

In [None]:
submission_df

In [None]:
# Print general information about the DataFrame
print(submission_df.info())  

# Print the data types of all columns
print(submission_df.dtypes) 

# Print unique values in the 'LABEL' column
print(submission_df['LABEL'].unique()) 

# Check for missing values
print(submission_df.isna().sum()) 

In [None]:
# # Text preprocessing function
# def preprocess_text(text):
#     processed_text = text.lower()
#     processed_text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))', '<URL>', processed_text)
#     processed_text = re.sub(r'\d+', '<PHONE>', processed_text)
#     processed_text = re.sub(r'[^a-zA-Z\s]', '', processed_text)
#     processed_text = re.sub(r'[^\w\s]', '<PUNCT>', processed_text)
#     processed_text = re.sub(r'\b\w\b', '<SNGL>', processed_text)
#     processed_text = re.sub(r'\s+', '<SPC>', processed_text).strip()
    
#     return processed_text

# # Preprocess text data
# train_data['TEXT'] = train_data['TEXT'].astype('str').apply(preprocess_text)
# test_data['TEXT'] = test_data['TEXT'].astype('str').apply(preprocess_text)