In [7]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [8]:
df = pd.read_csv('cefr_leveled_texts.csv')
df

Unnamed: 0,text,label
0,Hi!\nI've been meaning to write for ages and f...,B2
1,﻿It was not so much how hard people found the ...,B2
2,Keith recently came back from a trip to Chicag...,B2
3,"The Griffith Observatory is a planetarium, and...",B2
4,-LRB- The Hollywood Reporter -RRB- It's offici...,B2
...,...,...
1489,Light propagating in the vicinity of astrophys...,C2
1490,Future of dentistry has become one of the most...,C2
1491,﻿The forests – and suburbs – of Europe are ech...,C2
1492,Hedge funds are turning bullish on oil once ag...,C2


In [10]:
#handle missing data (non-numerical columns)
df.isnull().sum()


text     0
label    0
dtype: int64

In [11]:
#identify dependent and independent variables
X = df['text']  # ind var
y = df['label']  # dep var (target)

X.head(), y.head()

(0    Hi!\nI've been meaning to write for ages and f...
 1    ﻿It was not so much how hard people found the ...
 2    Keith recently came back from a trip to Chicag...
 3    The Griffith Observatory is a planetarium, and...
 4    -LRB- The Hollywood Reporter -RRB- It's offici...
 Name: text, dtype: object,
 0    B2
 1    B2
 2    B2
 3    B2
 4    B2
 Name: label, dtype: object)

In [12]:
#text cleaning
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re

def clean_text(text):
    #remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    #change text to lowercase
    text = text.lower()
    #remove english stop words
    text = ' '.join([word for word in text.split() if word not in ENGLISH_STOP_WORDS])
    
    return text

# apply clean_text function
X_cleaned = X.apply(clean_text)

X_cleaned.head()


0    hi ive meaning write ages finally today im act...
1    hard people challenge far avoid left researche...
2    keith recently came trip chicago illinois midw...
3    griffith observatory planetarium exhibit hall ...
4    lrb hollywood reporter rrb official amcs walki...
Name: text, dtype: object

In [13]:
#label encoding
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# get encoded labels (convert CEFR levels to integers for ease of processing)
y_encoded = label_encoder.fit_transform(y)

label_comparison = pd.DataFrame({'Original Label': y, 'Encoded Label': y_encoded})

label_comparison.sample(10, random_state=1)

Unnamed: 0,Original Label,Encoded Label
91,B2,3
75,B2,3
1209,A1,0
330,A2,1
959,B1,2
742,C1,4
893,B1,2
1194,A1,0
462,A2,1
335,A2,1


In [14]:
#text vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

#init vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=6000, ngram_range=(1, 3))

# fit and transform text to get vectors
X_vectorized = tfidf_vectorizer.fit_transform(X_cleaned)

# show shape of sparse matrix (to understand its dimensions)
X_vectorized.shape


(1494, 5000)

In [15]:
#split data (between training set and validation set)
X_train, X_val, y_train, y_val = train_test_split(X_vectorized, y_encoded, test_size=0.2, random_state=42)

#show training and validation sets
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((1195, 5000), (299, 5000), (1195,), (299,))

In [16]:
#init multinomial naive bayes classifier 
from sklearn.naive_bayes import MultinomialNB

nb_classifier = MultinomialNB()

In [17]:
#train the model with the pre-processed data
nb_classifier.fit(X_train, y_train)

In [18]:
#time to make predictions on the validation data set
y_pred = nb_classifier.predict(X_val)
y_pred

array([4, 3, 4, 3, 3, 3, 1, 1, 1, 2, 0, 3, 0, 5, 3, 1, 3, 3, 0, 3, 1, 3,
       3, 4, 1, 3, 3, 3, 0, 3, 3, 1, 5, 3, 3, 3, 1, 5, 0, 3, 3, 3, 1, 1,
       1, 1, 1, 0, 4, 1, 0, 1, 3, 1, 3, 3, 2, 4, 3, 1, 4, 3, 1, 3, 5, 4,
       0, 1, 1, 3, 3, 0, 3, 3, 0, 3, 0, 1, 0, 1, 0, 3, 3, 4, 3, 1, 0, 0,
       4, 1, 0, 5, 0, 4, 1, 5, 0, 3, 0, 1, 3, 4, 4, 3, 5, 3, 4, 3, 1, 3,
       1, 1, 3, 1, 3, 0, 0, 4, 0, 0, 5, 4, 3, 4, 3, 4, 0, 3, 3, 0, 3, 3,
       0, 1, 2, 3, 0, 3, 0, 3, 5, 3, 3, 1, 3, 1, 1, 5, 3, 0, 0, 0, 5, 1,
       1, 3, 0, 1, 3, 0, 1, 3, 0, 3, 3, 1, 0, 0, 3, 4, 1, 1, 4, 3, 1, 4,
       4, 0, 4, 3, 3, 4, 0, 3, 5, 4, 1, 0, 1, 0, 1, 1, 0, 1, 3, 0, 4, 1,
       5, 1, 5, 3, 1, 0, 1, 3, 4, 3, 0, 3, 3, 0, 3, 1, 0, 4, 4, 4, 3, 3,
       3, 1, 1, 3, 3, 3, 4, 1, 0, 3, 0, 0, 5, 4, 5, 2, 0, 2, 3, 3, 1, 0,
       4, 0, 4, 1, 4, 0, 1, 0, 0, 3, 3, 5, 1, 3, 3, 0, 0, 3, 3, 5, 2, 5,
       1, 0, 1, 1, 1, 0, 3, 1, 4, 3, 1, 3, 0, 3, 0, 1, 1, 3, 1, 3, 0, 3,
       0, 3, 4, 3, 1, 3, 0, 0, 2, 1, 0, 3, 3])

In [20]:
#determine accuracy of model
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_val, y_pred)
accuracy

0.5284280936454849