In [14]:
import pandas as pd
import numpy as np
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer


In [15]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# 1. Preprocessing

#### 1.1 Read Data

In [16]:
f = open("train.txt",'r')

'''
    Les données sont enregistrés sur df avec deux colonnes 
    colonne 'label' => langage maternelle
    colonne 'doc' => phrases en anglais 
'''
label=[]
phrase=[]
lines = f.readlines()
for l in lines:
    label.append(l[:5])
    phrase.append(l[5:])
labelSet=set(label)
df = pd.DataFrame(list(zip(label, phrase)), columns =['label', 'doc']) 
print(df) 
print(label)

      label                                                doc
0     (GER)   IThe importance and popularity of travelling ...
1     (TUR)   It is an important decision , how to plan you...
2     (CHI)   Some people believe that young people can enj...
3     (TEL)   Travelling is usually considered as good recr...
4     (ARA)   i agree that . Life is a person live period o...
...     ...                                                ...
9895  (CHI)   Nowadays , more and more people go abroad , n...
9896  (KOR)   In accomplishing something that is risky come...
9897  (SPA)   At the beginning of the 21st century , the in...
9898  (HIN)   The number of cars in use across the world ha...
9899  (CHI)   Many people think it is betters to have borad...

[9900 rows x 2 columns]
['(GER)', '(TUR)', '(CHI)', '(TEL)', '(ARA)', '(SPA)', '(GER)', '(GER)', '(HIN)', '(JPN)', '(KOR)', '(CHI)', '(FRE)', '(TUR)', '(SPA)', '(FRE)', '(JPN)', '(ARA)', '(KOR)', '(ITA)', '(TUR)', '(TEL)', '(ARA)', '(SPA)', '(

#### 1.2 Clean Data

In [17]:
stopwords = set(stopwords.words('english'))

def clean_phrase(text):
    """
    Clean the input text 
    """
    cleaned_text = text.lower()
    cleaned_text =  re.sub(r'^https?:\/\/.*[\r\n]*', '', cleaned_text)
    cleaned_text = re.sub('<.*>', '', cleaned_text)
    cleaned_text = re.sub('[^a-zA-Z0-9\s]', '', cleaned_text)
    cleaned_text = cleaned_text.split()
    cleaned_text = [word for word in cleaned_text if word not in stopwords and len(word) > 1]
    
    return " ".join(cleaned_text)
df["doc"] = df["doc"].apply(lambda x: clean_phrase(x))


#### 1.3 LabelEncoder -- Prepare Y

In [18]:
'''
X est données des doc
Y est label apres encoder
'''

X=df["doc"]
Y=df["label"].tolist()
le = LabelEncoder()
le.fit(df.label.unique())
langue = le.classes_
Y=le.fit_transform(Y)

langue_encoder = [(langue[i],i) for i in range(len(langue))]
print(langue_encoder)



[('(ARA)', 0), ('(CHI)', 1), ('(FRE)', 2), ('(GER)', 3), ('(HIN)', 4), ('(ITA)', 5), ('(JPN)', 6), ('(KOR)', 7), ('(SPA)', 8), ('(TEL)', 9), ('(TUR)', 10)]


#### 1.4 TF-IDF -- Prepare X

In [26]:
#tf = TfidfVectorizer(stop_words='english')
tf = TfidfVectorizer(ngram_range=(2,2))
tf_X = tf.fit_transform(X)

In [20]:
cv = CountVectorizer(stop_words='english')
cv_X = cv.fit_transform(X)

# Modele

In [30]:
def model_cross_validation(X,y,k=5):
    kfold = KFold(n_splits = k, shuffle = True, random_state=42)
    svm_classifier = SVC(kernel='linear', C=1, random_state = 30)
    
    scores =[]
    i=0

    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    

        svm_classifier.fit(X_train, y_train)
        y_pred = svm_classifier.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        print(i,accuracy)
        scores.append(accuracy)
        i+=1
        
    # Print the accuracy for each fold
    for i, score in enumerate(scores, start=1):
        print(f"Fold {i}: {score:.4f}")
        
    average_accuracy = np.mean(scores)
    print(f"Average accuracy: {average_accuracy:.4f}")
    
#model_cross_validation(X,Y)
    

In [31]:
model_cross_validation(tf_X,Y,5)

0 0.5050505050505051
1 0.48383838383838385
2 0.503030303030303
3 0.49747474747474746
4 0.5106060606060606
Fold 1: 0.5051
Fold 2: 0.4838
Fold 3: 0.5030
Fold 4: 0.4975
Fold 5: 0.5106
Average accuracy: 0.5000


In [51]:
from sklearn.datasets import make_classification
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [22]:
def model_cross_validation_lr(X,y,k=5):
    kfold = KFold(n_splits = k, shuffle = True, random_state=42)
    lr = LogisticRegression(max_iter= 2000)
    scores =[]

    for train_index, test_index in kfold.split(X):
        print(train_index,test_index)
        
        X_train, X_test =  X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        tfidf_lr = lr.fit(X_train, y_train)
        y_pred = tfidf_lr.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        scores.append(accuracy)
        
    # Print the accuracy for each fold
    for i, score in enumerate(scores, start=1):
        print(f"Fold {i}: {score:.4f}")
        
    average_accuracy = np.mean(scores)
    print(f"Average accuracy: {average_accuracy:.4f}")
    
#model_cross_validation(X,Y)

In [23]:
model_cross_validation_lr(cv_X,Y)

[   1    2    4 ... 9895 9897 9899] [   0    3    8 ... 9893 9896 9898]
[   0    1    2 ... 9896 9897 9898] [  26   27   41 ... 9862 9877 9899]
[   0    1    3 ... 9897 9898 9899] [   2   15   28 ... 9885 9890 9895]
[   0    1    2 ... 9897 9898 9899] [   6    7   16 ... 9891 9892 9894]
[   0    2    3 ... 9896 9898 9899] [   1    4    5 ... 9887 9888 9897]
Fold 1: 0.6293
Fold 2: 0.6313
Fold 3: 0.6384
Fold 4: 0.6500
Fold 5: 0.6348
Average accuracy: 0.6368


In [16]:
index= [1,2]
t = [1,2,3,4]

In [21]:
for train_index, test_index in KFold(n_splits = 3, shuffle = True, random_state=42).split(X):
    print(X[train_index])

1       important decision plan syllabus students pref...
2       people believe young people enjoy life older p...
4       agree life person live period time people beli...
5       opinion travel group tour guide good way trave...
6       thing statement young people nowadays give eno...
                              ...                        
9891    plan trip would like travel group led tour gui...
9892    mind statement hand important issue discuss pe...
9894    learing fact human always curious knowing fact...
9895    nowadays people go abroad matter children old ...
9897    beginning 21st century increasing number cars ...
Name: doc, Length: 6600, dtype: object
0       ithe importance popularity travelling still ra...
1       important decision plan syllabus students pref...
3       travelling usually considered good recreation ...
4       agree life person live period time people beli...
5       opinion travel group tour guide good way trave...
                              ...

In [33]:
import tensorflow as tf