<a href="https://colab.research.google.com/github/takakishi/HEC_DS_ML_project/blob/main/src/wrangling_t.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup and Import Data

In [1]:
# Libraries
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# Training and further analysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Import Data
sample_submission = pd.read_csv('https://raw.githubusercontent.com/takakishi/HEC_DS_ML_project/main/data/data_raw/sample_submission.csv')
training_data = pd.read_csv('https://raw.githubusercontent.com/takakishi/HEC_DS_ML_project/main/data/data_raw/training_data.csv')
unlabelled_test_data = pd.read_csv('https://raw.githubusercontent.com/takakishi/HEC_DS_ML_project/main/data/data_raw/unlabelled_test_data.csv')

In [3]:
sample_submission.head()

Unnamed: 0,id,difficulty
0,0,A1
1,1,A1
2,2,A1
3,3,A1
4,4,A1


In [4]:
training_data.head()

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1


In [5]:
unlabelled_test_data.head()

Unnamed: 0,id,sentence
0,0,Nous dûmes nous excuser des propos que nous eû...
1,1,Vous ne pouvez pas savoir le plaisir que j'ai ...
2,2,"Et, paradoxalement, boire froid n'est pas la b..."
3,3,"Ce n'est pas étonnant, car c'est une saison my..."
4,4,"Le corps de Golo lui-même, d'une essence aussi..."


In [6]:
# Define functions
def preprocess_text(text):
    # Convert to lower case
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

# Wrangling

In [7]:
# Apply preprocess_text to the sentences
training_data['processed_sentence'] = training_data['sentence'].apply(preprocess_text)

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the processed sentences
X = tfidf_vectorizer.fit_transform(training_data['processed_sentence'])
y = training_data['difficulty']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
X_train.shape, X_val.shape

((3840, 15739), (960, 15739))

In [9]:
# Logistic regression
log_reg = LogisticRegression(random_state = 42)

# Train the model on the training set
log_reg.fit(X_train, y_train)

# Predict the labels on the validation set
y_val_pred = log_reg.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_val_pred)

# Calculate precision, recall, and F1-score
precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_val_pred, average = 'weighted')

# Confusion matrix
conf_matrix = confusion_matrix(y_val, y_val_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 0.440625
Precision: 0.4319099644772127
Recall: 0.440625
F1-Score: 0.42979627229311973
Confusion Matrix:
[[115  30   8   6   3   4]
 [ 57  48  29  10   7   7]
 [ 34  50  42   8  12  20]
 [ 10   5   8  63  37  30]
 [  8   4   8  26  66  40]
 [  8   3  10  26  29  89]]


In [10]:
# Random Forest
rf_clf = RandomForestClassifier(random_state=42)

rf_clf.fit(X_train, y_train)

y_val_pred = rf_clf.predict(X_val)

accuracy = accuracy_score(y_val, y_val_pred)

precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_val_pred, average='weighted')

conf_matrix = confusion_matrix(y_val, y_val_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 0.4
Precision: 0.39669483705968744
Recall: 0.4
F1-Score: 0.38119318205794434
Confusion Matrix:
[[134  16   8   6   0   2]
 [ 84  35  23  10   4   2]
 [ 50  37  40  16  14   9]
 [ 18   6  16  52  42  19]
 [ 13   3   7  41  65  23]
 [ 12   6  14  28  47  58]]
