# Text Pre Processing and Vectorisation

## Importing Libraries

In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from datetime import datetime

import re
from nltk.corpus import stopwords

In [2]:
import wget
import unzip

In [6]:
# Fetch the GloVe archive only when it is not already in the working
# directory, so re-running this cell does not repeat the download.
import os

if not os.path.exists('glove.6B.zip'):
    wget.download('http://nlp.stanford.edu/data/glove.6B.zip')


'glove.6B.zip'

In [7]:
import zipfile

In [8]:
# Unpack every member of the downloaded archive into the ./glove directory.
with zipfile.ZipFile('glove.6B.zip', mode='r') as glove_archive:
    glove_archive.extractall(path='glove')
    

## Loading the Sentence PDF Files

In [135]:
df_sentences_t = pd.read_csv('ParagraphToSentencesByNLTKSentTokenizer.csv')

In [136]:
df_sentences_m = pd.read_csv('ParagraphToSentencesByMe.csv')

In [181]:
df_sentences_d = pd.read_csv('ParagraphToSentencesByDotSpace.csv')

## Basic Exploration and Visualization

In [137]:
df_sentences_m.head()

Unnamed: 0.1,Unnamed: 0,Sentences
0,0,Dubai Building Code 2021 Edition Conte...
1,1,1 A.
2,2,2 A.
3,3,3 A.
4,4,4 A.


In [138]:
# Remove the redundant 'Unnamed: 0' column in place. errors='ignore' keeps the
# cell re-runnable: a second execution is a no-op instead of a KeyError.
df_sentences_m.drop(columns='Unnamed: 0', errors='ignore', inplace=True)
df_sentences_m

Unnamed: 0,Sentences
0,Dubai Building Code 2021 Edition Conte...
1,1 A.
2,2 A.
3,3 A.
4,4 A.
...,...
23092,The fire alarm control panel or the monito...
23093,Each villa/townhouse shall be provided wit...
23094,K 171 Dubai Building Code Part K: Vi...
23095,11 Security The common areas of villa or to...


In [139]:
# Remove the redundant 'Unnamed: 0' column in place. errors='ignore' keeps the
# cell re-runnable: a second execution is a no-op instead of a KeyError.
df_sentences_t.drop(columns='Unnamed: 0', errors='ignore', inplace=True)
df_sentences_t

Unnamed: 0,Sentences
0,Dubai Building Code 2021 Edition Conte...
1,The content of the DBC is based on the followi...
2,The DBC is arranged under themes to integrate ...
3,The regulations stipulated in the DBC are the ...
4,Nothing in the DBC prevents a building design ...
...,...
11648,8 of UAE FLSC [Ref.
11649,K.1].
11650,The fire alarm control panel or the monitoring...
11651,Each villa/townhouse shall be provided with au...


In [182]:
df_sentences_d.head()

Unnamed: 0.1,Unnamed: 0,Sentences
0,0,Dubai Building Code 2021 Edition Conte...
1,1,The content of the DBC is based on the followi...
2,2,The DBC is arranged under themes to integrate ...
3,3,The regulations stipulated in the DBC are the ...
4,4,Nothing in the DBC prevents a building design ...


In [183]:
# Remove the redundant 'Unnamed: 0' column in place. errors='ignore' keeps the
# cell re-runnable: a second execution is a no-op instead of a KeyError.
df_sentences_d.drop(columns='Unnamed: 0', errors='ignore', inplace=True)
df_sentences_d

Unnamed: 0,Sentences
0,Dubai Building Code 2021 Edition Conte...
1,The content of the DBC is based on the followi...
2,The DBC is arranged under themes to integrate ...
3,The regulations stipulated in the DBC are the ...
4,Nothing in the DBC prevents a building design ...
...,...
11784,K.1]
11785,The fire alarm control panel or the monitoring...
11786,Each villa/townhouse shall be provided with au...
11787,K 171 Dubai Building Code Part K: Villas...


In [140]:
df_sentences_m.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23097 entries, 0 to 23096
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentences  23097 non-null  object
dtypes: object(1)
memory usage: 180.6+ KB


In [141]:
df_sentences_m.describe()

Unnamed: 0,Sentences
count,23097.0
unique,14149.0
top,4.0
freq,548.0


In [142]:
df_sentences_t.describe()

Unnamed: 0,Sentences
count,11653
unique,9978
top,Ref.
freq,181


In [184]:
df_sentences_d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11789 entries, 0 to 11788
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentences  11783 non-null  object
dtypes: object(1)
memory usage: 92.2+ KB


In [185]:
df_sentences_d.describe()

Unnamed: 0,Sentences
count,11783
unique,10074
top,Ref
freq,185


In [143]:
# keep=False discards *every* row whose sentence text occurs more than once
# (no representative copy is retained); the frame is modified in place.
df_sentences_m.drop_duplicates(subset ="Sentences",
                     keep = False, inplace = True)
df_sentences_m

Unnamed: 0,Sentences
0,Dubai Building Code 2021 Edition Conte...
1,1 A.
2,2 A.
3,3 A.
4,4 A.
...,...
23092,The fire alarm control panel or the monito...
23093,Each villa/townhouse shall be provided wit...
23094,K 171 Dubai Building Code Part K: Vi...
23095,11 Security The common areas of villa or to...


In [144]:
# keep=False discards *every* row whose sentence text occurs more than once
# (no representative copy is retained); the frame is modified in place.
df_sentences_t.drop_duplicates(subset ="Sentences",
                     keep = False, inplace = True)
df_sentences_t

Unnamed: 0,Sentences
0,Dubai Building Code 2021 Edition Conte...
1,The content of the DBC is based on the followi...
2,The DBC is arranged under themes to integrate ...
3,The regulations stipulated in the DBC are the ...
4,Nothing in the DBC prevents a building design ...
...,...
11646,K.1] requires a smoke detection and alarm sys...
11647,The system shall be designed in accordance wit...
11650,The fire alarm control panel or the monitoring...
11651,Each villa/townhouse shall be provided with au...


In [186]:
# keep=False discards *every* row whose sentence text occurs more than once
# (no representative copy is retained); the frame is modified in place.
df_sentences_d.drop_duplicates(subset ="Sentences",
                     keep = False, inplace = True)
df_sentences_d

Unnamed: 0,Sentences
0,Dubai Building Code 2021 Edition Conte...
1,The content of the DBC is based on the followi...
2,The DBC is arranged under themes to integrate ...
3,The regulations stipulated in the DBC are the ...
4,Nothing in the DBC prevents a building design ...
...,...
11782,The system shall be designed in accordance wit...
11785,The fire alarm control panel or the monitoring...
11786,Each villa/townhouse shall be provided with au...
11787,K 171 Dubai Building Code Part K: Villas...


In [145]:
df_sentences_t.describe()

Unnamed: 0,Sentences
count,9139
unique,9139
top,Dubai Building Code 2021 Edition Conte...
freq,1


In [146]:
df_sentences_m.describe()

Unnamed: 0,Sentences
count,12592
unique,12592
top,Dubai Building Code 2021 Edition Conte...
freq,1


In [187]:
df_sentences_d.describe()

Unnamed: 0,Sentences
count,9219
unique,9219
top,Dubai Building Code 2021 Edition Conte...
freq,1


In [147]:
df_sentences_m.head()

Unnamed: 0,Sentences
0,Dubai Building Code 2021 Edition Conte...
1,1 A.
2,2 A.
3,3 A.
4,4 A.


In [194]:
df_sentences_t.head()

Unnamed: 0,Sentences
0,Dubai Building Code 2021 Edition Conte...
1,The content of the DBC is based on the followi...
2,The DBC is arranged under themes to integrate ...
3,The regulations stipulated in the DBC are the ...
4,Nothing in the DBC prevents a building design ...


In [195]:
df_sentences_d.head()

Unnamed: 0,Sentences
0,Dubai Building Code 2021 Edition Conte...
1,The content of the DBC is based on the followi...
2,The DBC is arranged under themes to integrate ...
3,The regulations stipulated in the DBC are the ...
4,Nothing in the DBC prevents a building design ...


In [188]:
# Strip leading and trailing whitespace from every entry of the Sentences column
df_sentences_m['Sentences'] = df_sentences_m['Sentences'].str.strip()

In [189]:
# Collapse each run of whitespace inside a sentence to a single space
df_sentences_m['Sentences'] = df_sentences_m['Sentences'].replace(
    to_replace=r'\s+', value=' ', regex=True
)

In [190]:
df_sentences_m.head()

Unnamed: 0,Sentences
0,Dubai Building Code 2021 Edition Contents Part...
1,1 A.
2,2 A.
3,3 A.
4,4 A.


In [191]:
df_sentences_m['Sentences'].str.len()

0        278
1          4
2          4
3          4
4          4
        ... 
23092    113
23093    133
23094     43
23095    124
23096     40
Name: Sentences, Length: 12592, dtype: int64

In [177]:
print (df_sentences_m.Sentences.str.len().sort_values().head())

11284    2
16161    2
11280    2
8477     2
8240     2
        ..
19284    8
17796    8
9372     8
4971     8
13532    8
Name: Sentences, Length: 600, dtype: int64


In [227]:
df_sentences_m[df_sentences_m["Sentences"].str.len()==4]

Unnamed: 0,Sentences
1,1 A.
2,2 A.
3,3 A.
4,4 A.
5,5 A.
...,...
21889,90).
22065,6 K.
22173,34].
22982,103.


In [228]:
df_sentences_m.loc[23041, 'Sentences']

'4 The fire alarm system in villas/townhouses shall be connected to DCD control centres via the Hassantuk for Homes system (available at: building.'

In [154]:
# Dropping the Sentence less than 5 characters long
# df_sentences_m = df_sentences_m[df_sentences_m["Sentences"].str.len()>5]

In [155]:
# df_sentences_m['Sentences'].str.strip()

In [170]:
def _align_left(column):
    """Return one left-align CSS declaration per entry of `column`."""
    return ["text-align:left" for _ in range(len(column))]


# Styler that applies the left-align rule to the sentence frame when rendered
df_m_left = df_sentences_m.style.apply(_align_left)

In [197]:
df_m_left

Unnamed: 0,Sentences
0,Dubai Building Code 2021 Edition Contents Part A Part B Part C Part D Part E General Architecture Accessibility Vertical transportation Building envelope Part F Part G Part H Part J Part K Structure Incoming utilities Indoor environment Security Villas ii Dubai Building Code A.
1,1 A.
2,2 A.
3,3 A.
4,4 A.
5,5 A.
6,6 A.
7,7 A.
8,8 A.
9,9 Introduction Definitions References Relationship of DBC to other local and international regulations Scope and application of the DBC Navigating the DBC Adoption of DBC Alternative solutions Maintenance of DBC A.


In [198]:
type(df_m_left)

pandas.io.formats.style.Styler

In [193]:
df_sentences_t.head(2)

Unnamed: 0,Sentences
0,Dubai Building Code 2021 Edition Conte...
1,The content of the DBC is based on the followi...
2,The DBC is arranged under themes to integrate ...
3,The regulations stipulated in the DBC are the ...
4,Nothing in the DBC prevents a building design ...


In [199]:
# Strip leading and trailing whitespace from every entry of the Sentences column
df_sentences_t['Sentences'] = df_sentences_t['Sentences'].str.strip()

In [234]:
df_sentences_t["Sentences"].str.len()

0        1324
1         420
2          88
3          88
4         114
         ... 
11646      75
11647      63
11650     114
11651     134
11652     231
Name: Sentences, Length: 9139, dtype: int64

In [208]:
print(df_sentences_t.Sentences.str.len().sort_values().head(800))

5477      2
4029      3
6963      4
6955      4
10439     4
         ..
9905     40
4901     40
8862     41
1825     41
11108    41
Name: Sentences, Length: 800, dtype: int64


In [226]:
df_sentences_t[df_sentences_t["Sentences"].str.len()==6]

Unnamed: 0,Sentences
1098,B.37].
1106,B.38].
1122,B.39].
1210,B.40].
1310,B.41].
...,...
11263,K.35].
11272,K.36].
11350,K.42].
11364,K.38].


In [225]:
print(df_sentences_t.loc[10439, 'Sentences'])

K.2.


In [232]:
# Store each sentence's character length in a new Char_Count column
# (vectorised; replaces an earlier per-row print loop).
df_sentences_t['Char_Count'] = df_sentences_t['Sentences'].str.len()

In [233]:
df_sentences_t

Unnamed: 0,Sentences,Char_Count
0,Dubai Building Code 2021 Edition Conte...,1324
1,The content of the DBC is based on the followi...,420
2,The DBC is arranged under themes to integrate ...,88
3,The regulations stipulated in the DBC are the ...,88
4,Nothing in the DBC prevents a building design ...,114
...,...,...
11646,K.1] requires a smoke detection and alarm sys...,75
11647,The system shall be designed in accordance wit...,63
11650,The fire alarm control panel or the monitoring...,114
11651,Each villa/townhouse shall be provided with au...,134


In [235]:
# Collapse each run of whitespace inside a sentence to a single space
df_sentences_t['Sentences'] = df_sentences_t['Sentences'].replace(
    to_replace=r'\s+', value=' ', regex=True
)

In [236]:
df_sentences_t['Char_Count'] = df_sentences_t['Sentences'].str.len()
df_sentences_t

Unnamed: 0,Sentences,Char_Count
0,Dubai Building Code 2021 Edition Contents Part...,1141
1,The content of the DBC is based on the followi...,403
2,The DBC is arranged under themes to integrate ...,87
3,The regulations stipulated in the DBC are the ...,87
4,Nothing in the DBC prevents a building design ...,113
...,...,...
11646,K.1] requires a smoke detection and alarm syst...,74
11647,The system shall be designed in accordance wit...,62
11650,The fire alarm control panel or the monitoring...,113
11651,Each villa/townhouse shall be provided with au...,133


###  Processing on Dataframe Created from ParagraphToSentencesByDotSpace.csv

In [237]:
df_sentences_d.head()

Unnamed: 0,Sentences
0,Dubai Building Code 2021 Edition Conte...
1,The content of the DBC is based on the followi...
2,The DBC is arranged under themes to integrate ...
3,The regulations stipulated in the DBC are the ...
4,Nothing in the DBC prevents a building design ...


In [238]:
# Strip leading and trailing whitespace from every entry of the Sentences column
df_sentences_d['Sentences'] = df_sentences_d['Sentences'].str.strip()

In [239]:
df_sentences_d["Sentences"].str.len()

0        1323
1         419
2          87
3          87
4         113
         ... 
11782      62
11785     113
11786     133
11787     180
11788      46
Name: Sentences, Length: 9219, dtype: int64

In [240]:
print(df_sentences_d.Sentences.str.len().sort_values().head(800))

8281     1
5521     1
5048     2
6010     2
4066     2
        ..
5038    38
2764    38
97      38
7425    38
6977    38
Name: Sentences, Length: 800, dtype: int64


In [241]:
df_sentences_d[df_sentences_d["Sentences"].str.len()==1]

Unnamed: 0,Sentences
5521,7
8281,9


In [246]:
print(df_sentences_d.loc[5049, 'Sentences'])

When landings are cast in-situ after placement of the precast stair flights, the stair  should be propped


In [None]:
# Store each sentence's character length in a new Char_Count column
# (vectorised; replaces an earlier per-row print loop).
df_sentences_d['Char_Count'] = df_sentences_d['Sentences'].str.len()
# Display the frame with the new column
df_sentences_d

In [None]:
# Collapse each run of whitespace inside a sentence to a single space
df_sentences_d['Sentences'] = df_sentences_d['Sentences'].replace(
    to_replace=r'\s+', value=' ', regex=True
)

In [None]:
df_sentences_d['Char_Count'] = df_sentences_d['Sentences'].str.len()
df_sentences_d

In [158]:
# Small example frame (team, subject, points), built row by row
_example_rows = [
    ('Team 1', 'Math', 10),
    ('Team 1', 'Science', 8),
    ('Team 2', 'Science', 10),
    ('Team 3', 'Math', 6),
    ('Team 2', 'Science', 6),
    ('Team 3', 'Math', 5),
]
df = pd.DataFrame(_example_rows, columns=['team', 'Subject', 'points'])

In [159]:
df

Unnamed: 0,team,Subject,points
0,Team 1,Math,10
1,Team 1,Science,8
2,Team 2,Science,10
3,Team 3,Math,6
4,Team 2,Science,6
5,Team 3,Math,5


In [160]:
print (df.Subject.str.len().sort_values())

0    4
3    4
5    4
1    7
2    7
4    7
Name: Subject, dtype: int64


In [161]:
# # Dropping the team 1
# df = df[df["Subject"].str.len()>4]

In [162]:
# df

## Extracting Word Vectors

In [163]:
# Extract word vectors: build a {token: float32 vector} dict from
# ./glove/glove.6B.100d.txt. Each line is whitespace-separated, with the token
# first and its vector components after it.
word_embeddings = {}
# The context manager closes the file even if a line fails to parse.
with open('./glove/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: nothing to index, and values[0] would raise IndexError.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs