<a href="https://colab.research.google.com/github/suvasish114/Facility-Detection-and-Popularity-Assessment/blob/main/Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Facility Detection and Popularity Assessment

## 1. Data Collection

In [None]:
# Installing Libraries
%pip install numpy pandas matplotlib nltk sklearn

In [None]:
# Importing libraries
import re
import random
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import nltk
nltk.download('all')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
lemmatizer = WordNetLemmatizer()

In [3]:
# Load datasets
dataset = pd.read_csv("twitter.csv")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5145 entries, 0 to 5144
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  5145 non-null   int64  
 1   id          5145 non-null   int64  
 2   date        5145 non-null   object 
 3   username    5145 non-null   object 
 4   content     5145 non-null   object 
 5   hashtags    1206 non-null   object 
 6   likes       5145 non-null   int64  
 7   view        5141 non-null   float64
 8   longitude   5145 non-null   float64
 9   latitude    5145 non-null   float64
 10  place       5145 non-null   object 
 11  country     5142 non-null   object 
dtypes: float64(3), int64(3), object(6)
memory usage: 482.5+ KB


## 2. Data Cleaning and Sanitization

In [4]:
# size of the dataset
print(f"Size: {dataset.shape[0]}")
# check for missing values
print(f"Missing content present: {dataset['content'].isna().any()}")
print(f"Missing hashtags present: {dataset['hashtags'].isna().any()}")

Size: 5145
Missing content present: False
Missing hashtags present: True


In [5]:
# setup training data contains only english language
training_data = dataset[["content"]]
training_data.head()

Unnamed: 0,content
0,Update: We tracked our own air tags to the air...
1,Self check in at airports is not for all passp...
2,Stopover at 🇦🇪 before 🇮🇹❗️ (@ Dubai Internatio...
3,Yallll I realized I forgot my wallet once I go...
4,Checked out 😩 (@ Holiday Inn Express Sydney Ai...


In [6]:
# Data sanitation
def sanitation(content):
    ''' 1. Remove all URLs.
        2. Remove all non-ASCII characters, and numbers.
        3. Replace @ with 'at'.
        4. Replace _ with space.
        5. Lower case all characters. '''
    ascii = [32] + list(range(65,91)) + list(range(97,123))
    _s = content.split(" ")
    temp = []
    for a in _s:
        # remove URL
        if re.findall("^http",a): pass
        # remove non-ASCII characters and numbers
        else:
            temp1 = ""
            for b in a:
                if b == "@": temp1 += "at "
                elif b == "_": temp1 += " "
                elif ord(b) in ascii: temp1 += b
                # else: pass
            if len(temp1.strip()) > 0: temp.append(temp1.lower())
    return " ".join(temp).strip()

# sanitation("my favourite track is feel_good_inside by @gorilaz")

In [7]:
# Word tokenization
# removed non-ASCII characters
tokenize = []
for a in list(training_data["content"]):
    tokenize.append(word_tokenize(sanitation(a)))

In [8]:
# Removing stop words
stop_words = set(stopwords.words('english'))
stop_words.remove('at')
filtered_contents = []
for w in tokenize:
    temp = [a for a in w if a not in stop_words]
    filtered_contents.append(temp)

# free memeory
del tokenize

In [9]:
# Lemmatization
for i in range(len(filtered_contents)):
    for j in range(len(filtered_contents[i])):
        filtered_contents[i][j] = lemmatizer.lemmatize(filtered_contents[i][j])

In [28]:
# Creating training dataset
temp = []
for a in filtered_contents:
    temp.append([" ".join(a),random.randint(0,1)])
training_data = pd.DataFrame(temp, columns=["content","class"])
training_data.head()

Unnamed: 0,content,class
0,update tracked air tag airport man man finally...,0
1,self check at airport passport holder shaawon ...,1
2,stopover at at dubai international airport dubai,0
3,yallll realized forgot wallet got airport tsa ...,0
4,checked at holiday inn express sydney airport ...,0


In [27]:
# Vectorization
tfidfVectorizer = TfidfVectorizer()
tfidf_corpus = tfidfVectorizer.fit_transform(training_data["content"])

In [30]:
print(tfidf_corpus)

  (0, 3190)	0.20326458156353006
  (0, 6374)	0.259932916090755
  (0, 4884)	0.3551754072726634
  (0, 4445)	0.27210247658918724
  (0, 7544)	0.5097986169293055
  (0, 353)	0.04681640826638896
  (0, 12030)	0.3651615134188747
  (0, 312)	0.2568453514538075
  (0, 12560)	0.396181192362104
  (0, 12957)	0.2693894073341391
  (1, 4875)	0.4209763934415294
  (1, 191)	0.4209763934415294
  (1, 11012)	0.4209763934415294
  (1, 5558)	0.4016954085972346
  (1, 9173)	0.3251621943281612
  (1, 965)	0.07133228991674437
  (1, 2368)	0.2600392965869691
  (1, 10915)	0.35505427084213326
  (1, 353)	0.04974643694811551
  (2, 6124)	0.18793416244052272
  (2, 3723)	0.7901550937890729
  (2, 11721)	0.5460356680114836
  (2, 965)	0.1939279052827415
  (2, 353)	0.06762170626996408
  (3, 11138)	0.2792103104739601
  :	:
  (5143, 8496)	0.17236782349582538
  (5143, 6124)	0.09819942406237897
  (5143, 965)	0.1013312766613984
  (5143, 353)	0.0706674350590983
  (5144, 10825)	0.23000791827249412
  (5144, 4125)	0.23467312543797578
  (514

## 3. Classification

In [10]:
# Naive Bayas classification
len(filtered_contents)

5145

In [11]:
# Support Vector Matrix


## 4. Analysis

## 5. Future Scopre