<a href="https://colab.research.google.com/github/suvasish114/Facility-Detection-and-Popularity-Assessment/blob/main/Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Facility Detection and Popularity Assessment

## 1. Data Collection

In [None]:
# Installing Libraries
%pip install numpy pandas matplotlib nltk sklearn geopy googletrans==3.1.0a0

In [None]:
# Importing libraries
import re
import random
import numpy as np
import pandas as pd
import nltk
from geopy.geocoders import Nominatim
from googletrans import Translator
from matplotlib import pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Object initialization
nltk.download('all')
lemmatizer = WordNetLemmatizer()
translator = Translator()

In [None]:
# Load datasets
dataset = pd.read_csv("twitter.csv")
dataset.info()

## 2. Data Cleaning and Sanitization

In [9]:
# size of the dataset
print(f"Size: {dataset.shape[0]}")
# check for missing values
print(f"Missing content present: {dataset['content'].isna().any()}")
print(f"Missing hashtags present: {dataset['hashtags'].isna().any()}")

Size: 5145
Missing content present: False
Missing hashtags present: True


In [None]:
### This step will take more than 1 hour to execute. Use precalculated class_labeled dataset insted.
# Labeling each tweet based on their featured location
# User was at featured location: 1
# User was not at featured location: 0

geolocator = Nominatim(user_agent="getLocation")
class_label = []
i = 0
exceptions = dict()
for lat,lng in map(list, zip(dataset["latitude"],dataset["longitude"])):
    loc = ",".join([str(lat),str(lng)])
    address = None
    try:
        _address = geolocator.reverse(loc).address
        address = str(translator.translate(_address, dest="en").text).lower()
    except:
        exceptions.update({i:loc})
    finally:
        print(f"{i}: {address}")
        if address == None or "airport" not in address: class_label.append(0)
        else: class_label.append(1)
        i += 1

In [10]:
# Test
print(len(class_label))

5145


In [11]:
# setup training data contains only english language
training_data = dataset[["content"]]
training_data.head()

Unnamed: 0,content
0,Update: We tracked our own air tags to the air...
1,Self check in at airports is not for all passp...
2,Stopover at 🇦🇪 before 🇮🇹❗️ (@ Dubai Internatio...
3,Yallll I realized I forgot my wallet once I go...
4,Checked out 😩 (@ Holiday Inn Express Sydney Ai...


In [12]:
# Data sanitation
def sanitation(content):
    ''' 1. Remove all URLs.
        2. Remove all non-ASCII characters, and numbers.
        3. Replace @ with 'at'.
        4. Replace _ with space.
        5. Lower case all characters. '''
    ascii = [32] + list(range(65,91)) + list(range(97,123))
    _s = content.split(" ")
    temp = []
    for a in _s:
        # remove URL
        if re.findall("^http",a): pass
        # remove non-ASCII characters and numbers
        else:
            temp1 = ""
            for b in a:
                if b == "@": temp1 += "at "
                elif b == "_": temp1 += " "
                elif ord(b) in ascii: temp1 += b
                # else: pass
            if len(temp1.strip()) > 0: temp.append(temp1.lower())
    return " ".join(temp).strip()

# sanitation("my favourite track is feel_good_inside by @gorilaz")

In [13]:
# Word tokenization
# removed non-ASCII characters
tokenize = []
for a in list(training_data["content"]):
    tokenize.append(word_tokenize(sanitation(a)))

In [14]:
# Removing stop words
stop_words = set(stopwords.words('english'))
stop_words.remove('at')
filtered_contents = []
for w in tokenize:
    temp = [a for a in w if a not in stop_words]
    filtered_contents.append(temp)

# free memeory
del tokenize

In [15]:
# Lemmatization
for i in range(len(filtered_contents)):
    for j in range(len(filtered_contents[i])):
        filtered_contents[i][j] = lemmatizer.lemmatize(filtered_contents[i][j])

In [37]:
# Creating training dataset
temp = []
for i in range(dataset.shape[0]):
    temp.append([" ".join(filtered_contents[i]), class_label[i]])
df = pd.DataFrame(temp, columns=["content","class_label"])
df.groupby("class_label").count()

Unnamed: 0_level_0,content
class_label,Unnamed: 1_level_1
0,4471
1,674


In [38]:
# Spliting dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df.content,df.class_label)
print(f"Training Data: {x_train.shape[0]}")
print(f"Test Data: {x_test.shape[0]}")

Training Data: 3858
Test Data: 1287


In [39]:
# Vectorization
tfidfVectorizer = TfidfVectorizer()
tfidf_corpus = tfidfVectorizer.fit_transform(x_train.values)
tfidf_corpus.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## 3. Classification

In [40]:
# Naive Bayes classification
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(tfidf_corpus,y_train)

In [57]:
# Prediction
x_test_count = tfidfVectorizer.transform(x_test.values[:-1])
for a in model.predict(x_test_count):
    print(a, end=" ")

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [None]:
# Support Vector Matrix


## 4. Analysis