# **NLP PROJECT: AUTOMATIC TRANSLATION**


### Importing libraries
---

In [1]:
import nltk
import numpy as np
import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import joblib


### Dataset
---
English-French Corpus


In [2]:
# Read the first 1000 rows of the English-French corpus and label its
# two columns 'english' and 'french'
data = pd.read_csv(
    '/content/drive/MyDrive/english_french_dataset.csv',
    nrows=1000,
).set_axis(['english', 'french'], axis=1)

### Preprocessing
---

In [3]:
def preprocess(text):
    """Normalize an English sentence before TF-IDF vectorization.

    Steps, in order: strip ASCII punctuation, lowercase, tokenize, drop
    English stop words, lemmatize, and re-join the tokens with single spaces.

    Args:
        text: The sentence to clean. ``None`` and NaN (how pandas represents
            an empty CSV cell) are treated as an empty sentence; any other
            non-string value is converted with ``str``.

    Returns:
        The cleaned sentence as one space-separated string ('' when nothing
        is left).
    """
    # Missing values cannot be iterated character by character, so treat
    # them as an empty sentence instead of raising TypeError.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Remove punctuation characters
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Build the stop-word set once per call: re-reading the word list for
    # every token is loop-invariant work, and set membership is O(1).
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Drop stop words, then lemmatize what remains
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join the tokens back into a string
    return ' '.join(tokens)

In [4]:
# Clean every English sentence with preprocess(); the 'english' column is overwritten
data['english'] = data['english'].map(preprocess)

### Vectorization
---

In [5]:
# TF-IDF vectorizer created with its default settings (no arguments passed)
vectorizer = TfidfVectorizer()

In [6]:
# Fit the vectorizer on the English column and transform it into the
# TF-IDF input vectors used for training
input_vectors = vectorizer.fit_transform(data['english'])

### Knowledge discovery
---

In [7]:
# Configure a linear-kernel support vector classifier
svm_model = SVC(
    C=1,
    kernel='linear',
    random_state=42,
)

In [8]:
# Train the SVM model: the French sentences are passed as the class labels
# for the TF-IDF vectors of their English counterparts, so the classifier
# can only ever output a French sentence that appears in the training data
svm_model.fit(input_vectors, data['french'])

### Save the model
---

In [9]:
# Save the trained model as svm_model.pkl (path is relative to the
# current working directory)
joblib.dump(svm_model, 'svm_model.pkl')

# Save the fitted vectorizer as vectorizer.pkl; as the last expression in
# the cell, this call's return value is what the notebook displays
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

### Testing stage
---

In [10]:
# NOTE(review): joblib is already imported in the first cell; presumably it is
# repeated here so the testing stage can be run on its own — confirm.
import joblib

# Load the saved SVM model (rebinds the svm_model name used during training).
# joblib.load unpickles arbitrary objects, so only load files you trust.
svm_model = joblib.load('svm_model.pkl')

# Load the saved vectorizer (rebinds the vectorizer name used during training)
vectorizer = joblib.load('vectorizer.pkl')

In [14]:
def get_translation(sentence):
    """Return the French sentence the trained classifier predicts for `sentence`.

    The vectorizer was fitted on text that had been passed through
    ``preprocess``, so the query goes through the same cleaning before it is
    vectorized. Without that step, inflected words and punctuation-joined
    tokens in the query would not match the fitted vocabulary.

    Args:
        sentence: English sentence to translate.

    Returns:
        The predicted label, i.e. one of the French sentences the model was
        trained on.
    """
    cleaned_sentence = preprocess(sentence)
    # transform() expects an iterable of documents, hence the one-item list
    sentence_vector = vectorizer.transform([cleaned_sentence])
    predicted_output = svm_model.predict(sentence_vector)
    # predict() returns one label per input row; only one row was given
    return predicted_output[0]

In [15]:
# Ask the user for an English sentence, translate it with get_translation(),
# and print the predicted French sentence
user_sentence = input("Sentence to translate in French: ")
response = get_translation(user_sentence)
print('sentence Translate: ',response)

Sentence to translate in French: What is light ?
sentence Translate:  Qu’est-ce que la lumière?
