## Imports

In [1]:
from typing import Tuple, List

import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

## Reading Datasets

In [2]:
def read_csv(filename: str) -> pd.DataFrame:
    return pd.read_csv(filename, sep=',', encoding='latin-1')

train_and_val_data = read_csv('data/ds4420_kaggle_train_data.csv')
test_data = read_csv('data/ds4420_kaggle_test_data.csv')

def split_label(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    return data.drop('Label', axis=1), data['Label']

X_train_and_val, y_train_and_val = split_label(train_and_val_data)
X_test = test_data

## Splitting Into Training and Validation Sets

In [3]:
X_train, X_val, y_train, y_val = train_test_split(X_train_and_val, y_train_and_val, test_size=0.2, random_state=42)

X_train.head(3)

Unnamed: 0,ID,Text,Selected_Text,Selected_Text_Sentiment,Time_of_Post,User_Age,Population,Land_Size,Population_Density
13294,13294,In a period of exams days are for study and ni...,no funny time..,negative,7,10,102334404,995450,103
17124,17124,So bored.... couldn`t go to a-kon...,bored..,negative,22,37,33469203,425400,79
12560,12560,its nice to have no assignments for the night,nice,positive,17,69,26378274,318000,83


## TF-IDF Vectorizer & Logistic Regression

In [4]:
# vectorizer = TfidfVectorizer()
# X_train_tfidf_vec = vectorizer.fit_transform(X_train['Text'])
# X_val_tfidf_vec = vectorizer.transform(X_val['Text'])

# model = LogisticRegression(random_state=1).fit(X_train_tfidf_vec, y_train)
# y_train_pred = model.predict(X_train_tfidf_vec)
# y_val_pred = model.predict(X_val_tfidf_vec)

# print(classification_report(y_val, y_val_pred))

# test_data_exportable = test_data.copy()
# X_test_vec = vectorizer.transform(X_test['Text'])
# y_test_pred = model.predict(X_test_vec)
# test_data_exportable['Label'] = y_test_pred
# test_data_exportable = test_data_exportable[['ID', 'Label']]
# test_data_exportable.to_csv('model1.csv', index=False)

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      3261
           1       0.85      0.60      0.70      1476

    accuracy                           0.84      4737
   macro avg       0.84      0.77      0.80      4737
weighted avg       0.84      0.84      0.83      4737



In [5]:
# def clean_text(text: str) -> List[str]:
#     text = text.lower()
#     text = re.sub(r'[^\w\s]', '', text)
#     return text

# X_train_features = X_train.copy()
# X_val_features = X_val.copy()

# X_train_features['Text'] = X_train_features['Text'].apply(clean_text)
# X_val_features['Text'] = X_val_features['Text'].apply(clean_text)

# vectorizer = TfidfVectorizer(stop_words='english')
# X_train_tfidf_vec = vectorizer.fit_transform(X_train_features['Text'])
# X_val_tfidf_vec = vectorizer.transform(X_val_features['Text'])

# model = LogisticRegression(random_state=1).fit(X_train_tfidf_vec, y_train)
# y_train_pred = model.predict(X_train_tfidf_vec)
# y_val_pred = model.predict(X_val_tfidf_vec)

# print(classification_report(y_val, y_val_pred))

# test_data_exportable = test_data.copy()
# X_test_vec = vectorizer.transform(X_test['Text'])
# y_test_pred = model.predict(X_test_vec)
# test_data_exportable['Label'] = y_test_pred
# test_data_exportable = test_data_exportable[['ID', 'Label']]
# test_data_exportable.to_csv('model2.csv', index=False)

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      3261
           1       0.84      0.60      0.70      1476

    accuracy                           0.84      4737
   macro avg       0.84      0.77      0.79      4737
weighted avg       0.84      0.84      0.83      4737



In [6]:
# vectorizer = TfidfVectorizer()
# X_train_tfidf_vec = vectorizer.fit_transform(X_train['Text'])
# X_val_tfidf_vec = vectorizer.transform(X_val['Text'])

# model = RandomForestClassifier(n_estimators=100, random_state=1).fit(X_train_tfidf_vec, y_train)
# y_train_pred = model.predict(X_train_tfidf_vec)
# y_val_pred = model.predict(X_val_tfidf_vec)

# print(classification_report(y_val, y_val_pred))

# test_data_exportable = test_data.copy()
# X_test_vec = vectorizer.transform(X_test['Text'])
# y_test_pred = model.predict(X_test_vec)
# test_data_exportable['Label'] = y_test_pred
# test_data_exportable = test_data_exportable[['ID', 'Label']]
# test_data_exportable.to_csv('model3.csv', index=False)

              precision    recall  f1-score   support

           0       0.84      0.94      0.89      3261
           1       0.83      0.61      0.70      1476

    accuracy                           0.84      4737
   macro avg       0.83      0.77      0.79      4737
weighted avg       0.84      0.84      0.83      4737



In [21]:
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(stop_words='english'), 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post'])
    ]
)

pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        #('regressor', LogisticRegression(max_iter=3000, random_state=1)),
        ('regressor', RandomForestClassifier(max_iter=3000, random_state=1))
    ]
)

pipeline.fit(X_train, y_train)

y_train_pred = pipeline.predict(X_train)
y_val_pred = pipeline.predict(X_val)

print(classification_report(y_val, y_val_pred))

test_data_exportable = test_data.copy()
X_test_vec = vectorizer.transform(X_test['Text'])
y_test_pred = model.predict(X_test_vec)
test_data_exportable['Label'] = y_test_pred
test_data_exportable = test_data_exportable[['ID', 'Label']]
test_data_exportable.to_csv('model4.csv', index=False)

TypeError: RandomForestClassifier.__init__() got an unexpected keyword argument 'max_iter'