In [1]:
%%capture
!pip -q install langchain tiktoken chromadb pypdf sentence_transformers InstructorEmbedding faiss-cpu

In [2]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings
import pickle
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceInstructEmbeddings
from google.colab import drive

  from tqdm.autonotebook import trange


### Load Multiple files from Directory

In [3]:
# Mount Google Drive inside the Colab filesystem (force_remount=True is passed
# to drive.mount) and remember the Drive folder the later cells build paths from.
gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point, force_remount=True)
root_dir = f"{gdrive_mount_point}/My Drive"

Mounted at /content/gdrive


In [4]:
# Load the PDFs matched by "./*.pdf" inside <root_dir>/Documents/, parsing each
# file with PyPDFLoader. The loaded documents carry 'source' and 'page'
# metadata, which later cells read.
pdf_folder = f'{root_dir}/Documents/'
loader = DirectoryLoader(pdf_folder, glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [5]:
# Peek at the first loaded document only: echoing the whole list dumps the
# full text of every loaded document into the cell output.
documents[:1]

[Document(page_content='Crime Prediction Using Twitter  Sentiment and Weather  Xinyu Chen, Youngwoon Cho, and Suk young Jang University of Virginia, xc7xn, yc5ac, sj2rh@virginia.edu   Abstract - Social networking services have the hidden potential to reveal valuable insights when statistical analysis is applied to their unstructured data. As shown by previous research, GPS-tagged Twitter data enables the prediction of future crimes in a major city, Chicago, Illinois, of the United States. However, existing crime prediction models that incorporate data from Twitter have limitations in describing criminal incidents due to the absence of sentiment polarity and weather factors. The addition of sentiment analysis and weather predictors to such models would deliver significant insight about how crime. Our aim is to predict the time and location in which a specific type of crime will occur. Our approach is based on sentiment analysis by applying lexicon-based methods and understanding of cate

In [6]:
# Collect the distinct source paths recorded in the metadata of the loaded documents.
unique_sources = {doc.metadata['source'] for doc in documents}
num_unique_sources = len(unique_sources)

print("Number of unique sources:", num_unique_sources)

print()

print("Unique source names:")
# A set iterates in arbitrary order; sort so the listing is identical on every run.
for source_name in sorted(unique_sources):
    print(source_name)

Number of unique sources: 3

Unique source names:
/content/gdrive/My Drive/Documents/2022_Crime_Prediction_and_Monitoring_in_Porto_Portugal_Using_Machine_Learning_Spatial_and_Text_Analytics_5.pdf
/content/gdrive/My Drive/Documents/2015_Crime_prediction_using_twitter_sentiment_and_weather_11.pdf
/content/gdrive/My Drive/Documents/2022_Crime_prediction_using_a_hybrid_sentiment_analysis_approach_10.pdf


### Divide and Conquer

In [7]:
# Split the loaded documents into overlapping chunks for embedding.
# chunk_size / chunk_overlap are measured by the splitter's length function
# (presumably characters - confirm against the installed langchain version).
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

texts = text_splitter.split_documents(documents)

In [8]:
# Show the first chunk produced by the splitter as a sanity check.
texts[0]

Document(page_content='Crime Prediction Using Twitter  Sentiment and Weather  Xinyu Chen, Youngwoon Cho, and Suk young Jang University of Virginia, xc7xn, yc5ac, sj2rh@virginia.edu   Abstract - Social networking services have the hidden potential to reveal valuable insights when statistical analysis is applied to their unstructured data. As shown by previous research, GPS-tagged Twitter data enables the prediction of future crimes in a major city, Chicago, Illinois, of the United States. However, existing crime prediction models that incorporate data from Twitter have limitations in describing criminal incidents due to the absence of sentiment polarity and weather factors. The addition of sentiment analysis and weather predictors to such models would deliver significant insight about how crime. Our aim is to predict the time and location in which a specific type of crime will occur. Our approach is based on sentiment analysis by applying lexicon-based methods and understanding of categ

In [9]:
# Number of chunks the documents were split into.
len(texts)

199

### Get Embeddings for the Documents

In [10]:
def store_embeddings(docs, embeddings, sotre_name, path):
    """Build a FAISS vector store from ``docs`` and pickle it to disk.

    The store is written to ``<path>/faiss_<sotre_name>.pkl``, the same file
    name that ``load_embeddings`` reads back.

    Args:
        docs: Documents to index; passed unchanged to ``FAISS.from_documents``.
        embeddings: Embedding model; passed unchanged to ``FAISS.from_documents``.
        sotre_name: Label embedded in the pickle's file name. The spelling is
            kept as is so existing keyword callers keep working.
        path: Directory that receives the pickle. It is created when missing.

    Returns:
        None. The vector store is only persisted, not returned.
    """
    import os  # local import so this cell does not depend on the import cell

    # open(..., "wb") does not create parent directories, so make sure the
    # target folder exists. Do it before the (potentially slow) embedding step
    # so an unwritable location fails early.
    if path:
        os.makedirs(path, exist_ok=True)

    vectorStore = FAISS.from_documents(docs, embeddings)

    with open(f"{path}/faiss_{sotre_name}.pkl", "wb") as f:
        pickle.dump(vectorStore, f)

In [11]:
def load_embeddings(sotre_name, path):
    """Read back an object pickled at ``<path>/faiss_<sotre_name>.pkl``.

    Args:
        sotre_name: Label that forms the middle of the pickle's file name.
        path: Directory containing the pickle.

    Returns:
        Whatever object the pickle file contains.

    Note:
        ``pickle.load`` can execute arbitrary code while deserialising, so only
        load files that this notebook (or another trusted party) produced.
    """
    pickle_file = f"{path}/faiss_{sotre_name}.pkl"
    with open(pickle_file, "rb") as handle:
        return pickle.load(handle)

### HuggingFace Instructor Embeddings

In [12]:
# Load the Instructor embedding model. Use the GPU when one is available and
# fall back to the CPU otherwise, so the cell no longer fails on a runtime
# without CUDA (expect the CPU path to be much slower for a model this size).
import torch  # local import; expected to be present as a dependency of sentence_transformers

embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
                                                      model_kwargs={"device": embedding_device})

load INSTRUCTOR_Transformer
max_seq_length  512


In [13]:
# Folder under root_dir intended for persisted embedding stores.
# NOTE(review): none of the cells visible here pass this path to
# store_embeddings / load_embeddings - confirm it is used further down.
Embedding_store_path = f"{root_dir}/Embedding_store"

In [14]:
# Build the FAISS index from the chunks using the Instructor embeddings.
# NOTE(review): this is recomputed on every run; store_embeddings /
# load_embeddings are defined above but not called in the visible cells.
db_instructEmbedd = FAISS.from_documents(texts, instructor_embeddings)

In [15]:
# Expose the index as a retriever; search_kwargs {"k": 3} is forwarded to each
# search (presumably the number of hits to return - confirm).
retriever = db_instructEmbedd.as_retriever(search_kwargs={"k": 3})

In [16]:
# Inspect which search strategy the retriever is configured with.
retriever.search_type

'similarity'

In [17]:
# Confirm the search arguments that were passed to as_retriever.
retriever.search_kwargs

{'k': 3}

In [18]:
%%time
# Query 1: fetch the retriever's matches for a question about a paper's
# authors; %%time reports how long the lookup takes.
docs = retriever.get_relevant_documents("Who are the authors of 2015 crime prediction using twitter sentiment and weather report?")

CPU times: user 42 ms, sys: 898 µs, total: 42.9 ms
Wall time: 64.2 ms


In [19]:
# Show the first document returned for the query above.
docs[0]

Document(page_content='Crime Prediction Using Twitter  Sentiment and Weather  Xinyu Chen, Youngwoon Cho, and Suk young Jang University of Virginia, xc7xn, yc5ac, sj2rh@virginia.edu   Abstract - Social networking services have the hidden potential to reveal valuable insights when statistical analysis is applied to their unstructured data. As shown by previous research, GPS-tagged Twitter data enables the prediction of future crimes in a major city, Chicago, Illinois, of the United States. However, existing crime prediction models that incorporate data from Twitter have limitations in describing criminal incidents due to the absence of sentiment polarity and weather factors. The addition of sentiment analysis and weather predictors to such models would deliver significant insight about how crime. Our aim is to predict the time and location in which a specific type of crime will occur. Our approach is based on sentiment analysis by applying lexicon-based methods and understanding of categ

In [20]:
# Unpack the first hit of the latest query so its parts can be printed by name.
best_match = docs[0]
content = best_match.page_content
metadata = best_match.metadata
source = metadata['source']
page_number = metadata['page']

# Every field except the last is followed by one blank line.
print("Page content:", content, end="\n\n")
print("Metadata:", metadata, end="\n\n")
print("Source:", source, end="\n\n")
print("Page:", page_number)

Page content: Crime Prediction Using Twitter  Sentiment and Weather  Xinyu Chen, Youngwoon Cho, and Suk young Jang University of Virginia, xc7xn, yc5ac, sj2rh@virginia.edu   Abstract - Social networking services have the hidden potential to reveal valuable insights when statistical analysis is applied to their unstructured data. As shown by previous research, GPS-tagged Twitter data enables the prediction of future crimes in a major city, Chicago, Illinois, of the United States. However, existing crime prediction models that incorporate data from Twitter have limitations in describing criminal incidents due to the absence of sentiment polarity and weather factors. The addition of sentiment analysis and weather predictors to such models would deliver significant insight about how crime. Our aim is to predict the time and location in which a specific type of crime will occur. Our approach is based on sentiment analysis by applying lexicon-based methods and understanding of categorized we

In [21]:
%%time
# Query 2: keyword-style query about the KDE of Porto's street segments;
# %%time reports how long the lookup takes.
docs = retriever.get_relevant_documents("information regarding kde for porto on street segments")

CPU times: user 38.7 ms, sys: 972 µs, total: 39.7 ms
Wall time: 41.1 ms


In [22]:
# Unpack the first hit of the latest query so its parts can be printed by name.
best_match = docs[0]
content = best_match.page_content
metadata = best_match.metadata
source = metadata['source']  # extracted but not printed separately in this cell
page_number = metadata['page']

# Every field except the last is followed by one blank line.
print("Page content:", content, end="\n\n")
print("Metadata:", metadata, end="\n\n")
print("Page:", page_number)

Page content: Figure 1. Porto’s registered crime occurrence between 2016 and 2018: ( a) by hour; ( b) by month and
day (source: authors, based on data reports of Porto’s Public Safety Police).
4.2. Spatial and Temporal Pattern
Figure 2 shows a KDE for Porto, based on the values of street segments. The Law
of Crime Concentration is conﬁrmed, as speciﬁc segments and areas of the city are more
prone to criminal occurrences than others. This happens particularly in the downtown
area (the greatest concentration) in and around the main pedestrian/shopping street of
the city, Santa Catarina Street, and the main square where the City Hall is located (Aliados
Avenue), both close to the city’s nighttime district. Elsewhere, noticeable concentrations
also occur on the northern edge of the city, where the largest university campus and the
city’s main hospital are located, and in other main avenues as Boavista Avenue (to the city’s

Metadata: {'source': '/content/gdrive/My Drive/Documents/2022_Crim

In [23]:
%%time
# Query 3: loosely worded query about location-tagged Twitter data;
# %%time reports how long the lookup takes.
# NOTE(review): "gfs" looks like a misspelling of "GPS" - confirm whether the
# typo is intentional (e.g. to probe how tolerant the retrieval is).
docs = retriever.get_relevant_documents("enabling gfs tagged Twitter data for prediction of crimes in metro city")

CPU times: user 38.7 ms, sys: 1.87 ms, total: 40.6 ms
Wall time: 41.8 ms


In [24]:
# Unpack the first hit of the latest query so its parts can be printed by name.
best_match = docs[0]
content = best_match.page_content
metadata = best_match.metadata
source = metadata['source']  # extracted but not printed separately in this cell
page_number = metadata['page']

# Every field except the last is followed by one blank line.
print("Page content:", content, end="\n\n")
print("Metadata:", metadata, end="\n\n")
print("Page:", page_number)

Page content: prediction on future crime incidents. DATA COLLECTION The primary data source that we used for making a crime predictive model was Twitter data. We utilized tweets with GPS coordinates, which were generated within the Chicago city boundary from January 1st, 2014 to Jan 31st, 2014 (n=1069804). The twitter posts we used came from the official Twitter streaming API, bounded with coordinates [-87.94, 41.64] (South-West limit) and [-87.52, 42.02] (North-East limit) [9]. Figure 1 shows a kernel density estimation plot for tweets generated within the Chicago city boundary during the time period. In addition to Twitter data, Chicago criminal incidents data shows the historical trends of theft incidents occurred in Chicago. This data originated from the Chicago data portal website, which was developed by the Chicago Police Department by tracking theft incidents committed on spatial points indicated with specific latitude and longitude, and the time of the theft incidents [10]. The

In [25]:
%%time
# Query 4: query about the Christchurch mosque attacks;
# %%time reports how long the lookup takes.
# NOTE(review): "rgearding" looks like a misspelling of "regarding" - confirm
# whether the typo is intentional before correcting the query string.
docs = retriever.get_relevant_documents("rgearding christchurch mosque attack series")

CPU times: user 39 ms, sys: 1.89 ms, total: 40.9 ms
Wall time: 44.7 ms


In [26]:
# Unpack the first hit of the latest query so its parts can be printed by name.
best_match = docs[0]
content = best_match.page_content
metadata = best_match.metadata
source = metadata['source']  # extracted but not printed separately in this cell
page_number = metadata['page']

# Every field except the last is followed by one blank line.
print("Page content:", content, end="\n\n")
print("Metadata:", metadata, end="\n\n")
print("Page:", page_number)

Page content:     ISSN : 2502 -4752  
Indonesian J Elec Eng & Comp Sci , Vol. 25, No. 2, February 2022: 1131 -1139  1134  
days for each of the incidents, one day before, one day after, and the day in q uestion, all using the city's name 
as a keyword.  
− Christchurch Mosque attacks are a series of far -right terrorist attacks committed on March 15, 2019, by 
Brenton Tarrant against two mosques in the city of Christchurch, New Zealand, which left 51 dead and 
49 injured [16]. It is the deadliest massacre to have occurred in peacetime in New Zealand since that of 
Boyd in 1809 (66 to 70 dead).  
− El Paso shooting took place on August 3, 2019, in El Paso, Texas [17]. Twenty -three people are killed, 
at least twenty -three others are injured. The killer was arrested after surrendering to the police, says he 
wanted to kill as many Mexicans as possible. Racist hate crime is the hypothesis favored by a survey.

Metadata: {'source': '/content/gdrive/My Drive/Documents/2022_Crime_predicti