#           **Project Overview**


1.   *Extraction of scraped data*
2.   *Applying NLP text preprocessing techniques*
3.   *Applying a lemmatization technique*
4.   *Conversion of words to vectors*
5.   *Language translation using Hugging Face AI*

In [13]:
import requests
from bs4 import BeautifulSoup

def scrape_text(url, timeout=10):
    """Download a web page and return the text of every ``<p>`` element.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        A list of strings, one per ``<p>`` tag found on the page. Entries
        are returned as-is, so whitespace-only paragraphs are included.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of scraping the error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return [p.get_text() for p in soup.find_all('p')]

# Example usage: scrape the paragraphs of one Wikipedia article and print
# them. `paragraphs` is reused by the preprocessing step further down.
url = 'https://en.wikipedia.org/wiki/MS_Dhoni'
paragraphs = scrape_text(url)
print(paragraphs)


['\n', 'Mahendra Singh Dhoni (/məˈheɪndrə ˈsɪŋ dhæˈnɪ/ ⓘ; born 7 July 1981) is an Indian professional cricketer who plays as a right-handed batter and a wicket-keeper. Widely regarded as one of the most prolific wicket-keeper batsmen and captains, he represented the Indian cricket team and was the captain of the side in limited overs formats from 2007 to 2017 and in test cricket from 2008 to 2014. Dhoni has captained the most international matches and is the most successful Indian captain. He has led India to victory in the 2007 ICC World Twenty20, the 2011 Cricket World Cup, and the 2013 ICC Champions Trophy, being the only captain to win three different limited overs ICC tournaments. He also led the teams that won the Asia Cup in 2010, 2016 and was a member of the title winning squad in 2018.\n', 'Born in Ranchi, Dhoni made his first class debut for Bihar in 1999. He made his debut for the Indian cricket team on 23 December 2004 in an ODI against Bangladesh and played his first test 

In [14]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources named 'punkt' and 'stopwords'; the code below
# calls word_tokenize and stopwords.words('english').
# NOTE(review): newer NLTK releases may look for 'punkt_tab' rather than
# 'punkt' when tokenizing — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(paragraphs):
    """Turn raw paragraphs into lists of lowercase, stopword-free tokens.

    Each paragraph is lowercased, stripped of every character other than
    ``a``-``z`` and whitespace, tokenized with NLTK's ``word_tokenize`` and
    filtered against NLTK's English stopword list.

    Stripped characters are deleted rather than replaced by a space, so a
    hyphenated word such as "wicket-keeper" comes out as the single token
    "wicketkeeper".

    Args:
        paragraphs: Iterable of paragraph strings.

    Returns:
        A list with one token list per input paragraph; a paragraph with
        no surviving tokens yields an empty list.
    """
    english_stopwords = set(stopwords.words('english'))

    def clean(text):
        # Lowercase first so the a-z character class also keeps former capitals.
        letters_only = re.sub(r'[^a-z\s]', '', text.lower())
        return [
            token
            for token in word_tokenize(letters_only)
            if token not in english_stopwords
        ]

    return [clean(text) for text in paragraphs]

# Clean the scraped paragraphs and show the resulting token lists.
cleaned_data = preprocess_text(paragraphs)
print(cleaned_data)


[[], ['mahendra', 'singh', 'dhoni', 'mhendr', 'dhn', 'born', 'july', 'indian', 'professional', 'cricketer', 'plays', 'righthanded', 'batter', 'wicketkeeper', 'widely', 'regarded', 'one', 'prolific', 'wicketkeeper', 'batsmen', 'captains', 'represented', 'indian', 'cricket', 'team', 'captain', 'side', 'limited', 'overs', 'formats', 'test', 'cricket', 'dhoni', 'captained', 'international', 'matches', 'successful', 'indian', 'captain', 'led', 'india', 'victory', 'icc', 'world', 'twenty', 'cricket', 'world', 'cup', 'icc', 'champions', 'trophy', 'captain', 'win', 'three', 'different', 'limited', 'overs', 'icc', 'tournaments', 'also', 'led', 'teams', 'asia', 'cup', 'member', 'title', 'winning', 'squad'], ['born', 'ranchi', 'dhoni', 'made', 'first', 'class', 'debut', 'bihar', 'made', 'debut', 'indian', 'cricket', 'team', 'december', 'odi', 'bangladesh', 'played', 'first', 'test', 'year', 'later', 'sri', 'lanka', 'became', 'captain', 'odi', 'side', 'taking', 'formats', 'dhoni', 'retired', 'test

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
## Applying lemmatization
from nltk.stem import WordNetLemmatizer

# Fetch the 'wordnet' resource (presumably the data WordNetLemmatizer
# reads — confirm against the NLTK documentation).
nltk.download('wordnet')

def lemmatize_words(cleaned_data):
    """Lemmatize every token of every tokenized paragraph.

    Args:
        cleaned_data: Iterable of token lists, one list per paragraph.

    Returns:
        A list of token lists with the same nesting as the input, where
        each token has been passed through ``WordNetLemmatizer.lemmatize``
        with its default arguments.
    """
    # One lemmatizer instance is shared across all paragraphs.
    lemmatize = WordNetLemmatizer().lemmatize
    return [[lemmatize(token) for token in tokens] for tokens in cleaned_data]

# Lemmatize the cleaned token lists and show the result.
lemmatized_data = lemmatize_words(cleaned_data)
print(lemmatized_data)


[[], ['mahendra', 'singh', 'dhoni', 'mhendr', 'dhn', 'born', 'july', 'indian', 'professional', 'cricketer', 'play', 'righthanded', 'batter', 'wicketkeeper', 'widely', 'regarded', 'one', 'prolific', 'wicketkeeper', 'batsman', 'captain', 'represented', 'indian', 'cricket', 'team', 'captain', 'side', 'limited', 'over', 'format', 'test', 'cricket', 'dhoni', 'captained', 'international', 'match', 'successful', 'indian', 'captain', 'led', 'india', 'victory', 'icc', 'world', 'twenty', 'cricket', 'world', 'cup', 'icc', 'champion', 'trophy', 'captain', 'win', 'three', 'different', 'limited', 'over', 'icc', 'tournament', 'also', 'led', 'team', 'asia', 'cup', 'member', 'title', 'winning', 'squad'], ['born', 'ranchi', 'dhoni', 'made', 'first', 'class', 'debut', 'bihar', 'made', 'debut', 'indian', 'cricket', 'team', 'december', 'odi', 'bangladesh', 'played', 'first', 'test', 'year', 'later', 'sri', 'lanka', 'became', 'captain', 'odi', 'side', 'taking', 'format', 'dhoni', 'retired', 'test', 'cricket

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
## Converting words to vectors
from gensim.models import Word2Vec

# Train a Word2Vec model on the lemmatized paragraphs (each token list is
# one training sentence), requesting 100-dimensional vectors, a context
# window of 5, min_count=1 and 4 workers.
model = Word2Vec(lemmatized_data, vector_size=100, window=5, min_count=1, workers=4)
# Look up and print the vector learned for the token 'format'.
# NOTE(review): the word is hard-coded; presumably this lookup fails if
# 'format' is missing from the scraped text — confirm.
word_vector = model.wv['format']
print(word_vector)


[-0.00317504  0.00573897 -0.00142198 -0.00915274 -0.00310505  0.00562479
  0.00589788  0.00621275  0.00548826 -0.00965081  0.00105171  0.0026318
 -0.00251125 -0.00251922  0.00678768 -0.00891543  0.00337167  0.0033079
  0.00184232  0.00226244 -0.00615552  0.00297843 -0.00095098 -0.00429569
  0.00850656 -0.00730724 -0.00816996 -0.00664485 -0.00933698 -0.00204625
 -0.00783348 -0.00432039  0.0062664  -0.00269609 -0.00638996 -0.00262955
 -0.00703793  0.00773022  0.00319601 -0.00158107  0.00964481  0.00680529
 -0.00551196 -0.00778286  0.00673942  0.00556076 -0.00331706  0.00574781
  0.00935071  0.00318305 -0.00545307 -0.00193431 -0.00714387  0.00801256
 -0.00509104  0.00440934  0.0064543  -0.00086805 -0.00132705  0.00829709
 -0.00466981  0.01026126  0.00406914  0.00313934 -0.00376625 -0.00245612
 -0.00385999 -0.00629279  0.00578805 -0.00268205 -0.00961312  0.00925123
  0.00435389  0.00118886  0.00222384 -0.00337228  0.00246298  0.00646189
  0.00851363 -0.00123444  0.00894905  0.00366318  0.0

In [17]:
from collections import Counter


In [18]:
def frequency_ranking(model, corpus=None):
    """Rank the model's vocabulary by how often each word occurs in a corpus.

    Args:
        model: Trained model exposing its vocabulary as
            ``model.wv.index_to_key``.
        corpus: Iterable of token lists to count. Defaults to the
            module-level ``lemmatized_data`` so existing
            ``frequency_ranking(model)`` calls keep working.

    Returns:
        A ``(ranked_words, word_frequencies)`` tuple: the vocabulary sorted
        by descending corpus frequency, and a ``Counter`` mapping each
        corpus token to its number of occurrences.
    """
    if corpus is None:
        # Backward-compatible fallback to the notebook's global corpus.
        corpus = lemmatized_data

    # Frequency calculation based on the (lemmatized) token lists.
    word_frequencies = Counter(word for tokens in corpus for word in tokens)

    # sorted() is stable, so words with equal counts keep their vocabulary
    # order; vocabulary words absent from the corpus count as 0.
    ranked_words = sorted(
        model.wv.index_to_key,
        key=lambda word: word_frequencies[word],
        reverse=True,
    )

    return ranked_words, word_frequencies

# Rank the Word2Vec vocabulary by corpus frequency and print both the raw
# counts and the ranked word list.
ranked_words, word_frequencies = frequency_ranking(model)

print("Word Frequencies:", word_frequencies)
print("Ranked Words by Frequency:", ranked_words)

Word Frequencies: Counter({'dhoni': 126, 'run': 52, 'india': 48, 'series': 39, 'match': 37, 'scored': 35, 'cricket': 33, 'indian': 32, 'odi': 32, 'season': 25, 'captain': 24, 'team': 24, 'first': 24, 'trophy': 21, 'test': 18, 'final': 17, 'wicketkeeper': 16, 'cup': 16, 'led': 15, 'icc': 15, 'world': 15, 'played': 14, 'international': 13, 'tournament': 13, 'also': 13, 'became': 13, 'five': 13, 'one': 12, 'three': 12, 'squad': 12, 'made': 12, 'average': 12, 'csk': 12, 'second': 12, 'inning': 12, 'champion': 11, 'league': 11, 'sri': 10, 'ipl': 10, 'highest': 10, 'south': 10, 'scoring': 10, 'century': 10, 'named': 10, 'victory': 9, 'win': 9, 'title': 9, 'odis': 9, 'third': 9, 'stumping': 9, 'tour': 9, 'australia': 9, 'later': 8, 'lanka': 8, 'including': 8, 'chennai': 8, 'africa': 8, 'play': 7, 'batsman': 7, 'debut': 7, 'bihar': 7, 'december': 7, 'year': 7, 'two': 7, 'army': 7, 'jharkhand': 7, 'zone': 7, 'score': 7, 'record': 7, 'across': 7, 'england': 7, 'west': 7, 'indie': 7, 'new': 7, 'c

In [19]:
# Notebook cell that installs the `transformers` and `torch` packages.
# NOTE(review): a bare `pip install` is not Python syntax, so this line only
# works when run as a notebook command — consider `%pip install` there.
pip install transformers torch



In [20]:
from transformers import MarianMTModel, MarianTokenizer


In [22]:
# Function to translate a list of words using Hugging Face Transformers
def translate_words(words, target_language='fr', batch_size=64):
    """Translate English words with a Helsinki-NLP MarianMT model.

    Words are sent to the model in padded batches rather than one
    ``generate`` call per word, which matters when the input is an entire
    vocabulary.

    Args:
        words: Iterable of English strings to translate.
        target_language: Language code appended to
            ``Helsinki-NLP/opus-mt-en-`` to select the pretrained model.
        batch_size: Number of words translated per ``generate`` call.

    Returns:
        A list of translated strings, in the same order as ``words``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Load the model and tokenizer for the target language
    model_name = f'Helsinki-NLP/opus-mt-en-{target_language}'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    # Materialize once so generators and other one-shot iterables can be sliced.
    words = list(words)

    translations = []
    for start in range(0, len(words), batch_size):
        batch = words[start:start + batch_size]
        # padding=True pads the batch to a common length so it forms one tensor.
        tokenized_batch = tokenizer(batch, return_tensors="pt", padding=True)
        translated = model.generate(**tokenized_batch)
        translations.extend(
            tokenizer.decode(sequence, skip_special_tokens=True)
            for sequence in translated
        )

    return translations

# Translate the ranked words.
# NOTE: ranked_words holds the model's whole vocabulary, so every word in
# it is passed to the translation function.
translated_words = translate_words(ranked_words, target_language='fr')  # Translating to French

# Print the source words and their translations as two parallel lists.
print("Original_English_Words:", ranked_words)
print("Translated_French_Words:", translated_words)


Original_English_Words: ['dhoni', 'run', 'india', 'series', 'match', 'scored', 'cricket', 'odi', 'indian', 'season', 'team', 'captain', 'first', 'trophy', 'test', 'final', 'wicketkeeper', 'cup', 'led', 'world', 'icc', 'played', 'tournament', 'five', 'also', 'international', 'became', 'made', 'csk', 'inning', 'second', 'squad', 'average', 'three', 'one', 'league', 'champion', 'named', 'scoring', 'century', 'sri', 'highest', 'south', 'ipl', 'victory', 'tour', 'win', 'australia', 'odis', 'stumping', 'title', 'third', 'later', 'lanka', 'including', 'chennai', 'africa', 'england', 'batsman', 'record', 'zone', 'play', 'west', 'indie', 'two', 'army', 'across', 'bihar', 'debut', 'jharkhand', 'year', 'score', 'new', 'december', 'franchise', 'pakistan', 'east', 'captaincy', 'brand', 'four', 'ranji', 'lost', 'time', 'super', 'february', 'million', 'ball', 'u', 'stage', 'winning', 'ranchi', 'cricketer', 'dismissal', 'format', 'twenty', 'zealand', 'bcci', 'limited', 'club', 'asia', 'following', 'ga