# Atelier : NLP Manipulation

## Part I

### Text Cleaning

In [5]:
#Import libraries
import nltk
import string
import re
import spacy
nlp = spacy.load('en_core_web_sm')
from nltk.corpus import stopwords
from google.colab import drive
#drive.mount('drive/my-drive')
import pandas as pd

In [30]:
#load dataset
df = pd.read_csv("crypto.csv")
df.head()

Unnamed: 0,id,title,content,tags
0,3,What are the benefits of the two permutation t...,<p>Why do we use a permutation table in the fi...,block-cipher des permutation
1,7,Why use a 1-2 Oblivious Transfer instead of a ...,"<p>When initiating an <a href=""http://en.wikip...",oblivious-transfer multiparty-computation func...
2,8,Why do we append the length of the message in ...,"<p>As we know, <a href=""http://en.wikipedia.or...",sha-1 hash
3,9,What is the general justification for the hard...,<p>Since most cryptographic hash functions are...,hash cryptanalysis preimage-resistance
4,14,"How can I use asymmetric encryption, such as R...",<p>RSA is not designed to be used on long bloc...,encryption rsa public-key


In [9]:
#No let's check data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10432 entries, 0 to 10431
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       10432 non-null  int64 
 1   title    10432 non-null  object
 2   content  10432 non-null  object
 3   tags     10432 non-null  object
dtypes: int64(1), object(3)
memory usage: 326.1+ KB


In [31]:
#Since the type of data are object let's transform them to String so we can easily manipulate them later
df['title'] = df['title'].astype('string')
df['content'] = df['content'].astype('string')
df['tags'] = df['tags'].astype('string')

In [32]:
df['tags']

0                             block-cipher des permutation
1        oblivious-transfer multiparty-computation func...
2                                               sha-1 hash
3                   hash cryptanalysis preimage-resistance
4                                encryption rsa public-key
                               ...                        
10427                       hash hmac collision-resistance
10428                                         hash padding
10429                      perfect-secrecy forward-secrecy
10430                                  hash diffie-hellman
10431                    encryption aes symmetric key-wrap
Name: tags, Length: 10432, dtype: string

In [35]:
def clean_data(data):
# 1- Convert data to lower-case 
    data = data.lower() 
# 2- Remove URLs
    data = re.sub(r'https?://\S+|www\.\S+', '', data)
# 3- Remove HTML characters
    data = re.sub(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', '', data) 
# 4- Remove Characters in between Texts (Non alphabetical characters)
    data = re.sub(r'[^A-Za-z0-9]+', ' ', data) 
# 5- Remove Punctuation
    data = re.sub(r'[]!"$%&\'()*+,./:;=#@?[\\^_`{|}~-]+', ' ', data) 
# 6- Remove “\n”
    data = data.replace('\n', '').replace('\t','').replace('\\', '')
# 7- Remove unnecessary white spaces in between words
    #data = data.strip()
    data = ' '.join(word for word in data.split())
    
    return data

In [38]:
#Let's start cleaning our data
df['tags'] = df['tags'].map(lambda x: clean_data(x))
df['title'] = df['title'].map(lambda x: clean_data(x))
df['content'] = df['content'].map(lambda x: clean_data(x))
df

Unnamed: 0,id,title,content,tags
0,3,what are the benefits of the two permutation t...,why do we use a permutation table in the first...,block cipher des permutation
1,7,why use a 1 2 oblivious transfer instead of a ...,when initiating an why would someone use a rat...,oblivious transfer multiparty computation func...
2,8,why do we append the length of the message in ...,as we know,sha 1 hash
3,9,what is the general justification for the hard...,since most cryptographic hash functions are si...,hash cryptanalysis preimage resistance
4,14,how can i use asymmetric encryption such as rs...,rsa is not designed to be used on long blocks ...,encryption rsa public key
...,...,...,...,...
10427,39758,prevent hash collisions with hmac secret as iv,our organistation would like to hash large fil...,hash hmac collision resistance
10428,39759,can a modified executable file be padded to gi...,md5 or sha1 checksums seem to be the gold stan...,hash padding
10429,39761,definitions of secrecy,i found terms like forward secrecy future secr...,perfect secrecy forward secrecy
10430,39762,what is a ratchet,while reading whatsapp s security whitepaper i...,hash diffie hellman


### Text Preprocessing

#### Tokenization

In [39]:
#import libraries
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
Tokens_Title = df["title"].apply(nltk.word_tokenize).tolist()
Tokens_Content = df["content"].apply(nltk.word_tokenize).tolist()
Tokens_Tags = df["tags"].apply(nltk.word_tokenize).tolist()

##### 1 -Title Column

In [44]:
#Apply the Tokenization with Bigrams
Bigram_Token_Title = []
Trigram_Token_Title = []
Ngram_Token_Title = []

for item in Tokens_Title:
  bigram_token = list(nltk.bigrams(item))
  Bigram_Token_Title.append(bigram_token)
#Apply the Tokenization with Ngrams
for item in Tokens_Title:
  ngram_token = list(nltk.ngrams(item,6))
  Ngram_Token_Title.append(ngram_token)
#Apply the Tokenization with Trigrams
for item in Tokens_Title:
  trigram_token = list(nltk.trigrams(item))
  Trigram_Token_Title.append(trigram_token)


##### 2 -Content Column

In [45]:
Bigram_Token_Content = []
Trigram_Token_Content = []
Ngram_Token_Content = []
#Apply the Tokenization with Bigrams
for item in Tokens_Content:
  bigram_token = list(nltk.bigrams(item))
  Bigram_Token_Content.append(bigram_token)
#Apply the Tokenization with Ngrams
for item in Tokens_Content:
  ngram_token = list(nltk.ngrams(item,6))
  Ngram_Token_Content.append(ngram_token)
#Apply the Tokenization with Trigrams
for item in Tokens_Content:
  trigram_token = list(nltk.trigrams(item))
  Trigram_Token_Content.append(trigram_token)

##### 3 -Tags Column

In [52]:
Bigram_Token_Tags = []
Trigram_Token_Tags = []
Ngram_Token_Tags = []
#Apply the Tokenization with Bigrams
for item in Tokens_Tags:
  bigram_token = list(nltk.bigrams(item))
  Bigram_Token_Tags.append(bigram_token)
#Apply the Tokenization with Ngrams
for item in Tokens_Tags:
  ngram_token = list(nltk.ngrams(item,6))
  Ngram_Token_Tags.append(ngram_token)
#Apply the Tokenization with Trigrams
for item in Tokens_Tags:
  trigram_token = list(nltk.trigrams(item))
  Trigram_Token_Tags.append(trigram_token)

In [53]:
Bigram_Token_Tags

[[('block', 'cipher'), ('cipher', 'des'), ('des', 'permutation')],
 [('oblivious', 'transfer'),
  ('transfer', 'multiparty'),
  ('multiparty', 'computation'),
  ('computation', 'function'),
  ('function', 'evaluation')],
 [('sha', '1'), ('1', 'hash')],
 [('hash', 'cryptanalysis'),
  ('cryptanalysis', 'preimage'),
  ('preimage', 'resistance')],
 [('encryption', 'rsa'), ('rsa', 'public'), ('public', 'key')],
 [('des', 'encryption'), ('encryption', 's'), ('s', 'boxes')],
 [('dsa', 'bitcoin'),
  ('bitcoin', 'digital'),
  ('digital', 'cash'),
  ('cash', 'ripemd')],
 [('hash', 'implementation'), ('implementation', 'salt')],
 [('encryption', 'aes'),
  ('aes', 'block'),
  ('block', 'cipher'),
  ('cipher', 'key'),
  ('key', 'size')],
 [('hash', 'passwords')],
 [('cryptanalysis', 'block'),
  ('block', 'cipher'),
  ('cipher', 'differential'),
  ('differential', 'analysis')],
 [('coding', 'theory'),
  ('theory', 'elliptic'),
  ('elliptic', 'curves'),
  ('curves', 'hermitian'),
  ('hermitian', 'cur

#### What is the difference between the 3 techniques?

N-grams are phrases made up of N consecutive words that are taken from a sentence. As a result, a Unigram takes a statement and provides us with all the words in it. A bigram extracts sets of two consecutive words from a sentence and returns them to us. A trigram is a grouping of three words that follow one another in a sentence.

#### Apply Stop words with the English vocabulary

In [47]:
#import 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [48]:
Title_No_Stopwords = []
Content_No_Stopwords = []
Tags_No_Stopwords = []

for i in Tokens_Title:
  STOPWORDS = set(stopwords.words('english'))
  i = ' '.join(word for word in i if word not in STOPWORDS)  
  Title_No_Stopwords.append(i)

for i in Tokens_Content:
  STOPWORDS = set(stopwords.words('english'))
  i = ' '.join(word for word in i if word not in STOPWORDS)
  Content_No_Stopwords.append(i)

for i in Tokens_Tags:
  STOPWORDS = set(stopwords.words('english'))
  i = ' '.join(word for word in i if word not in STOPWORDS)
  Tags_No_Stopwords.append(i)


In [50]:
Title_No_Stopwords

['benefits two permutation tables des',
 'use 1 2 oblivious transfer instead 1 n oblivious transfer',
 'append length message sha 1 pre processing',
 'general justification hardness finding preimages cryptographic hash functions',
 'use asymmetric encryption rsa encrypt arbitrary length plaintext',
 'des box values determined',
 'provide secure vanity bitcoin address service',
 'random salt hash function work practice',
 'practical differences 256 bit 192 bit 128 bit aes encryption',
 'makes hash function good password hashing',
 'apply differential cryptanalysis block cipher',
 'current mathematics theory used cryptography coding theory',
 'ssl secure two way communication one key pair',
 'necessity randomness salts',
 'main weaknesses playfair cipher',
 'salting encrypting',
 'sufficient randomness xor acceptable mechanism encrypting',
 'feasible build stream cipher cryptographic hash function',
 'known methods constant time table free aes implementation using standard operations',
 

#### Apply Stemming

In [49]:
#import libraries
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [55]:
ps = PorterStemmer()
Tokens_Title_Stemmed = []
for i in Tokens_Title:
  stemmed_words = []
  for word in i:
    stemmed = ps.stem(word)
    stemmed_words.append(stemmed)
  Tokens_Title_Stemmed.append(stemmed_words)

#### Apply lemmatization

In [58]:
#download packages
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [59]:
Tokens_Title_lemmatized = []
lemmatizer = WordNetLemmatizer()
for i in Tokens_Title:
  lemmatized_words = []
  for word in i:
    lem = lemmatizer.lemmatize(word)
    lemmatized_words.append(lem)
  Tokens_Title_lemmatized.append(set(lemmatized_words))

#### What is the difference between Stemming and lemmatization?

The main difference between them is that Stemming uses the stem of the word, while lemmatization uses the context in which the word is being used.

#### Apply the Pos tagging technique

In [60]:
#import and download packages
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [73]:
#Title
Postag_Title_Tokens = []
for i in Tokens_Title:
  pos_tag_token = pos_tag(i)
  Postag_Title_Tokens.append(pos_tag_token)
#Content
Postag_Content_Tokens = []
for i in Tokens_Content:
  pos_tag_token = pos_tag(i)
  Postag_Content_Tokens.append(pos_tag_token)
#Tags
Postag_Tags_Tokens = []
for i in Tokens_Tags:
  pos_tag_token = pos_tag(i)
  Postag_Tags_Tokens.append(pos_tag_token)

In [None]:
Postag_Title_Tokens

In [None]:
Postag_Content_Tokens

In [None]:
Postag_Tags_Tokens

#### Implement a NER (Named Entity Recognition) model

In [62]:
#import library
import spacy

In [64]:
NER = nlp = spacy.load('en_core_web_sm')

In [None]:
#Apply NER on title
Ner_list=[]
for i in Tokens_Title[0:100]:
  for word in i:
    new_word = NER(word)
    for ent in new_word.ents:
      word_ner = (ent.text,ent.label_)
      Ner_list.append(word_ner)
Ner_list

In [None]:
#Apply NER on content
Ner_list_content=[]
for i in Tokens_Content[0:100]:
  for word in i:
    new_word = NER(word)
    for ent in new_word.ents:
      word_ner = (ent.text,ent.label_)
      Ner_list_content.append(word_ner)
Ner_list_content

In [None]:
#Apply NER on tags
Ner_list_tags=[]
for i in Tokens_Tags[0:100]:
  for word in i:
    new_word = NER(word)
    for ent in new_word.ents:
      word_ner = (ent.text,ent.label_)
      Ner_list_tags.append(word_ner)
Ner_list_tags

#### Identify the parts of the speech using Chunking

In [77]:
chunker = nltk.RegexpParser(
   r'''
   NP:{<DT><NN.*><.*>*<NN.*>}
   }<VB.*>{
   '''
)
chunking_Content_Tokens = []
for i in Tokens_Content:
  pos_tag_token = chunker.parse(i)
  Postag_Content_Tokens.append(pos_tag_token)

ValueError: ignored

#### Implement a TF-IDF function

In [78]:
# import libraries
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [80]:
vect = TfidfVectorizer()
weight_matrix = vect.fit_transform(Title_No_Stopwords)
pd.DataFrame(weight_matrix.toarray(),columns=vect.get_feature_names())



Unnamed: 0,00,01,05,0bbbbbbb,0d,0e329232ea6d0d73,0s,0x00,0x36,0x5c,...,zk,zkip,zkpok,zkpps,zlib,zone,zp,zrtp,ztarman,zuc
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
feature_names = vect.get_feature_names()
df_words_weights = pd.DataFrame()
df_words_weights.insert(0,"word",feature_names)

element = []
for i in feature_names:   
    element.append(df_weights[i].max())
df_words_weights.insert(1,"word_weight",element) 

df_words_weights

Unnamed: 0,word,word_weight
0,00,0.596322
1,01,0.533841
2,05,0.374300
3,0bbbbbbb,0.529118
4,0d,0.295590
...,...,...
6237,zone,0.355753
6238,zp,0.456839
6239,zrtp,0.382575
6240,ztarman,0.364787


#### Implement a Word Embedding function

In [87]:
#import library
from gensim.models import Word2Vec as w2v

In [88]:
flat_list_Tokens_Title = [item for sublist in Tokens_Title for item in sublist]
w2v = w2v(
    Tokens_Title,
    min_count=3,  
    sg = 1,       
    window=7      
)  
w2v['encryption']

  


array([ 0.2316566 , -0.2984453 , -0.10506061, -0.18209723,  0.04116612,
        0.31047317, -0.13097131,  0.11174038,  0.01166914, -0.16521014,
       -0.3578333 , -0.1939854 ,  0.08761793,  0.4736409 , -0.06254897,
       -0.40349045, -0.00837329,  0.17299818,  0.07126163,  0.12753049,
       -0.1368329 , -0.26224658, -0.18081003, -0.40786144, -0.14210244,
        0.01141309, -0.02124221,  0.0255001 , -0.2605694 ,  0.20875907,
        0.00349309, -0.07094663,  0.19422354,  0.2583012 ,  0.10109249,
        0.02559628, -0.35679176,  0.22967067,  0.05090909,  0.11299002,
        0.01251698, -0.0893384 , -0.00387137, -0.12521207, -0.39538813,
        0.11443657,  0.40207297,  0.22277977, -0.16029751,  0.1941529 ,
        0.34993425, -0.05590459,  0.13883665, -0.10806964,  0.14963026,
        0.09215766, -0.21821247, -0.02208439,  0.35760647,  0.09008313,
       -0.26957887,  0.353228  , -0.10914461,  0.10912058, -0.3451968 ,
       -0.25884324,  0.35656065, -0.29418865,  0.11325905,  0.11

In [89]:
words_embeddings = list(w2v.wv.vocab)
w2v.wv.vocab

{'what': <gensim.models.keyedvectors.Vocab at 0x7f074e43f8d0>,
 'are': <gensim.models.keyedvectors.Vocab at 0x7f074e43f550>,
 'the': <gensim.models.keyedvectors.Vocab at 0x7f074e46b050>,
 'benefits': <gensim.models.keyedvectors.Vocab at 0x7f074e46b090>,
 'of': <gensim.models.keyedvectors.Vocab at 0x7f074e4d2d90>,
 'two': <gensim.models.keyedvectors.Vocab at 0x7f074e45eb90>,
 'permutation': <gensim.models.keyedvectors.Vocab at 0x7f074e45e3d0>,
 'tables': <gensim.models.keyedvectors.Vocab at 0x7f074e45e9d0>,
 'in': <gensim.models.keyedvectors.Vocab at 0x7f074e46b510>,
 'des': <gensim.models.keyedvectors.Vocab at 0x7f074e45e150>,
 'why': <gensim.models.keyedvectors.Vocab at 0x7f074e45e810>,
 'use': <gensim.models.keyedvectors.Vocab at 0x7f074e45ead0>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7f074e45e590>,
 '1': <gensim.models.keyedvectors.Vocab at 0x7f074e45e390>,
 '2': <gensim.models.keyedvectors.Vocab at 0x7f074e45e950>,
 'oblivious': <gensim.models.keyedvectors.Vocab at 0x7f074e45

#### What is the difference between TF-IDF and Word Embedding?

The primary distinction between TF-IDF and word2vec is that TF-IDF is a statistical measure that we can apply to terms in a document and then use to form a vector, whereas word2vec will produce a vector for a term and then require additional work to convert that set of vectors into a singular vector.

### Similarity Techniques

#### Evaluate the terms similarity using the Levenshtein distance

In [95]:
#import and install libraries
#pip install fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

ModuleNotFoundError: ignored

In [None]:
for i in Bigram_Token_Title[0:100]:
  for words in i:
    ratio = fuzz.ratio(words[0],words[1])
    print("levenstein ratio between --|{0}|-- and --|{1}|-- is ----- {2} ".format(words[0],words[1],ratio))

for i in Bigram_Token_Content[0:100]:
  for words in i:
    ratio = fuzz.ratio(words[0],words[1])
    print("levenstein ratio between --|{0}|-- and --|{1}|-- is ----- {2} ".format(words[0],words[1],ratio))

for i in Bigram_Token_Tags[0:100]:
  for words in i:
    ratio = fuzz.ratio(words[0],words[1])
    print("levenstein ratio between --|{0}|-- and --|{1}|-- is ----- {2} ".format(words[0],words[1],ratio))

### Patterns/Rules Examples