### Importing Libraries

In [26]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [27]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### Loading Dataset

In [28]:
dataframe = pd.read_csv("./train.csv")

## **Lab - 1**


### Task - 1 (SENTENCE TOKENIZER)

In [29]:
result = []

In [30]:
for para in dataframe['text']:
	result.append(nltk.sent_tokenize(str(para)))
result

[['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'],
 ['Forest fire near La Ronge Sask.', 'Canada'],
 ["All residents asked to 'shelter in place' are being notified by officers.",
  'No other evacuation or shelter in place orders are expected'],
 ['13,000 people receive #wildfires evacuation orders in California'],
 ['Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school'],
 ['#RockyFire Update => California Hwy.',
  '20 closed in both directions due to Lake County fire - #CAfire #wildfires'],
 ['#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas'],
 ["I'm on top of the hill and I can see a fire in the woods..."],
 ["There's an emergency evacuation happening now in the building across the street"],
 ["I'm afraid that the tornado is coming to our area..."],
 ['Three people died from the heat wave so far'],
 ['Haha South Tampa is getting flooded hah- WAIT A SECOND I LIVE IN SOUTH TAMPA W

### Task - 2 (WORD TOKENIZER)


In [31]:
result = []

In [32]:
for para in dataframe["text"]:
	result.append(nltk.word_tokenize(str(para)))
result

[['Our',
  'Deeds',
  'are',
  'the',
  'Reason',
  'of',
  'this',
  '#',
  'earthquake',
  'May',
  'ALLAH',
  'Forgive',
  'us',
  'all'],
 ['Forest', 'fire', 'near', 'La', 'Ronge', 'Sask', '.', 'Canada'],
 ['All',
  'residents',
  'asked',
  'to',
  "'shelter",
  'in',
  'place',
  "'",
  'are',
  'being',
  'notified',
  'by',
  'officers',
  '.',
  'No',
  'other',
  'evacuation',
  'or',
  'shelter',
  'in',
  'place',
  'orders',
  'are',
  'expected'],
 ['13,000',
  'people',
  'receive',
  '#',
  'wildfires',
  'evacuation',
  'orders',
  'in',
  'California'],
 ['Just',
  'got',
  'sent',
  'this',
  'photo',
  'from',
  'Ruby',
  '#',
  'Alaska',
  'as',
  'smoke',
  'from',
  '#',
  'wildfires',
  'pours',
  'into',
  'a',
  'school'],
 ['#',
  'RockyFire',
  'Update',
  '=',
  '>',
  'California',
  'Hwy',
  '.',
  '20',
  'closed',
  'in',
  'both',
  'directions',
  'due',
  'to',
  'Lake',
  'County',
  'fire',
  '-',
  '#',
  'CAfire',
  '#',
  'wildfires'],
 ['#',
  

### Task - 3 (REMOVAL OF STOP WORDS)


In [33]:
corpusWithoutStopWords = []
stop_words = set(stopwords.words('english'))

In [34]:
def removeStopWords(para, tweetId):
	words = nltk.word_tokenize(para)
	for i in words:
		if i not in stop_words:
			corpusWithoutStopWords.append((i,tweetId))

In [35]:
for i in range(len(dataframe["text"])):
	removeStopWords(str(dataframe["text"][i]), (i+1))

In [36]:
# corpusWithoutStopWords

### Task - 4 (STEMMING)

In [37]:
ps = PorterStemmer()
stemmedCorpus = []

In [38]:
def stem(word):
	stemmedWord = nltk.word_tokenize(str(word[0]))
	return (stemmedWord, word[1])

In [39]:
for w in corpusWithoutStopWords:
	stemmedCorpus.append(stem(w))
stemmedCorpus

[(['Our'], 1),
 (['Deeds'], 1),
 (['Reason'], 1),
 (['#'], 1),
 (['earthquake'], 1),
 (['May'], 1),
 (['ALLAH'], 1),
 (['Forgive'], 1),
 (['us'], 1),
 (['Forest'], 2),
 (['fire'], 2),
 (['near'], 2),
 (['La'], 2),
 (['Ronge'], 2),
 (['Sask'], 2),
 (['.'], 2),
 (['Canada'], 2),
 (['All'], 3),
 (['residents'], 3),
 (['asked'], 3),
 (["'shelter"], 3),
 (['place'], 3),
 (["'"], 3),
 (['notified'], 3),
 (['officers'], 3),
 (['.'], 3),
 (['No'], 3),
 (['evacuation'], 3),
 (['shelter'], 3),
 (['place'], 3),
 (['orders'], 3),
 (['expected'], 3),
 (['13,000'], 4),
 (['people'], 4),
 (['receive'], 4),
 (['#'], 4),
 (['wildfires'], 4),
 (['evacuation'], 4),
 (['orders'], 4),
 (['California'], 4),
 (['Just'], 5),
 (['got'], 5),
 (['sent'], 5),
 (['photo'], 5),
 (['Ruby'], 5),
 (['#'], 5),
 (['Alaska'], 5),
 (['smoke'], 5),
 (['#'], 5),
 (['wildfires'], 5),
 (['pours'], 5),
 (['school'], 5),
 (['#'], 6),
 (['RockyFire'], 6),
 (['Update'], 6),
 (['='], 6),
 (['>'], 6),
 (['California'], 6),
 (['Hwy'

### **Cleaning the Stem Corpus**

In [40]:
cleanCorpus = []
for i in stemmedCorpus:
  if not (ord(i[0][0][0]) <= 64 or 91 <= ord(i[0][0][0]) <= 96):
    cleanCorpus.append(i)

## **Lab - 2**

### Task - 1 (INVERTED INDEX)

In [41]:
invertedIndex = dict()

In [42]:
def insertIntoInvertedIndex(word):
  if word[0][0] in invertedIndex:
    invertedIndex[word[0][0]].append(word[1])
  else:
    invertedIndex[word[0][0]] = [word[1]]

In [43]:
for i in cleanCorpus:
    insertIntoInvertedIndex(i)
invertedIndex

{'Our': [1,
  1372,
  1551,
  1646,
  1674,
  2221,
  2432,
  2856,
  2977,
  3100,
  3125,
  3173,
  3236,
  3346,
  3370,
  3619,
  3787,
  4025,
  4210,
  4282,
  4631,
  4660,
  4988,
  6568,
  7158],
 'Deeds': [1, 4986],
 'Reason': [1, 305, 306, 318, 320],
 'earthquake': [1,
  3029,
  3029,
  3030,
  3031,
  3039,
  3039,
  3042,
  3044,
  3045,
  3047,
  3048,
  3049,
  3052,
  3055,
  3062,
  3063,
  3066,
  5738,
  6025,
  6973,
  7132,
  7132,
  7136,
  7590],
 'May': [1,
  939,
  1476,
  1833,
  1935,
  2015,
  2794,
  4011,
  4139,
  4178,
  4256,
  5157,
  5472,
  5853,
  6533,
  6647],
 'ALLAH': [1],
 'Forgive': [1],
 'us': [1,
  43,
  284,
  682,
  830,
  856,
  1158,
  1221,
  1304,
  1439,
  1486,
  1632,
  1633,
  1646,
  1675,
  1699,
  1699,
  1741,
  1983,
  2056,
  2059,
  2224,
  2274,
  2373,
  2384,
  2560,
  2633,
  2892,
  2892,
  3084,
  3096,
  3574,
  3634,
  3834,
  3860,
  3979,
  4177,
  4230,
  4232,
  4258,
  4436,
  4582,
  4636,
  4670,
  4782,
  505

### Task - 2 (MERGE ALGORITHM)

In [44]:
def merge(posting1,posting2):
  i, j = 0, 0
  mergedList = []
  while(i < len(posting1) and j < len(posting2)):
    if posting1[i] == posting2[j]:
      mergedList.append(posting1[i])
      i += 1
      j += 1
    elif posting1[i] < posting2[j]:
      i += 1
    else:
      j += 1
  return mergedList

In [45]:
query_1 = input("Enter Query 1: ")
query_2 = input("Enter Query 2: ")
result = []
try:
  result = merge(invertedIndex[query_1],invertedIndex[query_2])
except:
  pass

Enter Query 1: range
Enter Query 2: rover


In [46]:
result

[80]