# **Installation and Import Packages**

In [3]:
!pip install newspaper3k



In [4]:
!pip install word2number



In [5]:
from newspaper import Article
import spacy
from spacy.matcher import Matcher
import pandas as pd
import spacy
from spacy import displacy
import re
from word2number import w2n
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import nltk


In [6]:
nltk.download('punkt')
nlp = spacy.load('en_core_web_sm') # for ner


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **Read BD location dictionary**

In [7]:
url = 'https://raw.githubusercontent.com/nurshatfateh/WebScraping-NaturalLanguageProcessing/main/dictionary.csv'
df = pd.read_csv(url,header=None)
df.head()
bd_dist_name = df.iloc[:,2].tolist()
bd_dist_name =list(set(bd_dist_name))  #remove duplicate name

# **Helper Function**

In [8]:
#Extract sentences from new article - look for sentence containig targe word list
def accident_related_sentences(text):
  target_words=['accident', 'die','collide','crash','kill','injure','wound','collision','plunge'] #--- can be changed/modified the list
  doc = nlp(text)
  result=[]
  for sentence in doc.sents:
        lemmatized_words = [token.lemma_.lower() for token in sentence]
        if any(lemma in lemmatized_words for lemma in target_words):
            result.append(sentence.text.lower())   # --- converted to lower case
  return result

In [9]:
# -- The funtion take accident related text and serh
def find_accident_loc(result, fulltext):
  accLocation=[]
  #location find by dictionary
  for sentence in result:
    words = word_tokenize(sentence) #[token.text for token in str(sentence)]
    for place in words:
      if place in bd_dist_name:
        accLocation.append(place)

    # location found by Name Entitiy recognition tag
  nerLoc=[]
  nerSentence= nlp(fulltext)
  for ner in nerSentence.ents:
    if(ner.label_=="GPE"):
      nerLoc.append(ner)

  return list(set(accLocation)), list(set(nerLoc))

In [10]:
# extract Age from text- required to as death/injur numbe funtion sometimes detect as target number.
# so in tht case age need to be removed from death/injure list
'''def find_age(text):
  age_=[]
  for sentence in text:
      age = re.findall(r'Age[\:\s](\d{1,3})', sentence)
      age.extend(re.findall(r' (\d{1,3}),? ', sentence))
      if len(age) == 0:
        age = re.findall(r'\((\d{1,3})\)', sentence)
      age_.append(age)
      ageList=[int(item) for sublist in age_ for item in sublist]
  return list(set(ageList))'''

"def find_age(text):\n  age_=[]\n  for sentence in text:\n      age = re.findall(r'Age[\\:\\s](\\d{1,3})', sentence)\n      age.extend(re.findall(r' (\\d{1,3}),? ', sentence))\n      if len(age) == 0:\n        age = re.findall(r'\\((\\d{1,3})\\)', sentence)\n      age_.append(age)\n      ageList=[int(item) for sublist in age_ for item in sublist]\n  return list(set(ageList))"

In [11]:
def removeAgeinfo(text):
    age_pattern = r'(\b(?:[A-Z][a-z]*\s*){1,3}),\s*(\d{1,3})\b'
    age_matches = re.findall(age_pattern, text)
    for name, age in age_matches:
      #print(f"Name: {name}, Age: {age}")
      text = re.sub(re.escape(f"{name}, {age}"), "", text)

    #print(text)
    return text


In [20]:
def find_death_injure_number(sentence):
  death_,injure_ = [],[]
  string= ""
  # print(sentence)
  for text in sentence:

    #covert all word number to numeric form
    numeric_word=[]
    text_num= nlp(text)
    for word in text_num.ents:
      if(word.label_=="CARDINAL"):
        if ' ' in word.text:
          templist= word.text.split();
          #print(templist)
          for w in templist:
            if(w.isnumeric()!=1):
              numeric_word.append(w)

        else:
          if(word.text.isnumeric()!=1):
            numeric_word.append(word.text)

    if len(numeric_word)>0:
      for numword in numeric_word:
        result= text.find(numword)              #give first index position
        prepart=text[:result]                   #split the 1st part of string before numeric word
        nextpart=text[result+len(numword):]     # split the last part of string after numeric word

        # convert numeric word to int number
        try:
          res = w2n.word_to_num(text[result:result+len(numword)])
        except Exception as e:
          continue
        string= prepart+str(res)+nextpart       # concate the separated string part
    else:
      string= text

    # find the closet number of the give word.
    wordList= ["kill","dead","die","injure","wound"]
    for keyword in wordList:
      expression1 = r"(?i)(?:\b"+keyword+"\D{0,20})([0-9][0-9,]*)[^.,]|([0-9][0-9,]*)[^.,](?:\D{0,20}"+keyword+")"
      expression2 = r"(?:(?<!\S)(\d+(?:,\d+)?)(?!\S)\D*\b"+keyword+"|\b"+keyword+"\D*(?<!\S)(\d+(?:,\d+)?)(?!\S))"
      res1= re.findall(expression1, string)
      res2= re.findall(expression2, string)
      output= [' '.join(temp) for temp in res1]+[''.join(temp) for temp in res2]

      res= list(set(output))
      if keyword != "injure" and keyword != "wound":
        death_.append(res)
      else:
        injure_.append(res)

    #clear duplicate and  death number matched with injure number
  #death=[float(s.replace(',', '')) for s in string_array]
  death=[float(item.replace(',', '')) for sublist in death_ for item in sublist]
  injure=[float(item.replace(',', '')) for sublist in injure_ for item in sublist]

  death=list(set(death))
  #print("death: ",death)
  injure=list(set(injure))
  #print("inj: ", injure)
  #death_final= [num for num in death if num not in injure]
  #print("final death: ",death_final)
  return death, injure

In [13]:
from datetime import datetime
from dateutil import parser

def convert24(str1):

    # Checking if last two elements of time
    # is AM and first two elements are 12
    if str1[-2:] == "AM" and str1[:2] == "12":
        return "00" + str1[2:-2]

    # remove the AM
    elif str1[-2:] == "AM":
        return str1[:-2]

    # Checking if last two elements of time
    # is PM and first two elements are 12
    elif str1[-2:] == "PM" and str1[:2] == "12":
        return str1[:-2]

    else:

        # add 12 to hours and remove PM
        return str(int(str1[:2]) + 12) + str1[2:8]


def find_time_of_accident(sentence):
    daypart=[]
    partoftheday=""
    formatted_time=""

    time12form=re.findall(r'\d?\d(?::\d\d)?\s*[ap]m', sentence)
    #print("time12-",time12form)
    doc = nlp(sentence)
    timelist=[]
    for word in doc.ents:
      if word.label_=="TIME":
        timelist.append(word.text)
    timelist= list(set(timelist+time12form))
   # print("time from ner-",timelist)
    for time in timelist:
      if any(char.isdigit() for char in time):
        try:
          parsed_time = parser.parse(time)
          if parsed_time.time():
            formatted_time = parsed_time.strftime("%I:%M %p")
        except ValueError:
          pass

      try:
        time24format= convert24(formatted_time)
        #print("converted-",time24format)
        #print(time24format)
        hr=int(time24format[:2])
        if hr>=5 and hr<12:
          partoftheday="Morning"
        elif hr>=12 and hr<18:
          partoftheday="Afternoon"
        elif hr>=18 and hr<20:
          partoftheday="Evening"
        elif hr>=20 and hr<=24 or hr>=0 and hr<5:
          partoftheday="Night"
        daypart.append(partoftheday)
      except ValueError:
          pass
    #print("daypart-", daypart)
    return list(set(time12form)), list(set(daypart))


In [14]:
def extract_vehicles(text):
    #  vehicle = ['truck','bus','car','bike','motorcycle','rickshaw','auto-rickshaw','tempo', 'pickup','train']
    vehicle_patterns = r"\b(?:bus|truck|car|motorbike|bike|rickshaw|cycle|bicycle|motorcycle|auto-rickshaw|tempo|pickup|train|honda)\b"
    extracted_vehicles = re.findall(vehicle_patterns, text, flags=re.IGNORECASE)
    return list(set(extracted_vehicles))

# **News Data Read**

In [15]:
news=pd.read_csv('newsarticles.csv')
news

Unnamed: 0.1,Unnamed: 0,date,link,title
0,179,2022-08-16,https://www.thedailystar.net/news/bangladesh/a...,5 lives lost to gross neglect
1,145,2022-08-29,https://www.dhakatribune.com/bangladesh/2022/0...,4 killed in road accidents across Bangladesh
2,182,2022-09-15,https://www.tbsnews.net/bangladesh/4-killed-10...,"4 killed, 10 injured in Chattogram road accident"
3,194,2022-10-03,https://www.tbsnews.net/bangladesh/road-crashe...,Road crashes claim 476 lives in September: Report
4,175,2022-10-08,https://www.newagebd.net/article/183133/11-peo...,11 people killed as bus catches fire after roa...
...,...,...,...,...
195,49,2023-07-28,https://www.thedailystar.net/business/economy/...,Vehicle insurance business going thru tough times
196,4,2023-07-28,https://bangladeshpost.net/posts/motorcyclist-...,Motorcyclist killed in Satkhira road accident
197,1,2023-07-28,https://www.thedailystar.net/news/bangladesh/a...,Road Safety Movement 2018: Infrastructure impr...
198,40,2023-07-29,https://www.daily-bangladesh.com/english/inter...,"6 killed, 20 injured as 2 buses collide in Ind..."


In [32]:
newsDate, accidentLocation, deathNumber, injuredNumber=[],[],[],[]
accTime, dayPart, vechileList, acciddentType=[],[],[],[]
newsUrl, newsText=[],[]

for i in range(len(news)): #range(len(news)):   #49 -test for age  # 13 te problem
  try:
    article = Article(news['link'][i])
    article.download()
    article.parse()


    print(i, "--", news['title'][i])
    newsText.append(article.text)   #add in a column
    newsDate.append(news['date'][i])  # add as column
    newsUrl.append(news['link'][i])

    textWithTitle= news['title'][i]+". "+ article.text #added title with the text
    clearText=removeAgeinfo(textWithTitle)
    related_text=accident_related_sentences(clearText) #to find location
    related_text

    AccicdentLoc, NERloc= find_accident_loc(related_text, textWithTitle)
    print("1. Location: ",AccicdentLoc)
    accidentLocation.append(AccicdentLoc)

    death_num, injure_num= find_death_injure_number(related_text)
    print("2. DeathNo: ",death_num)
    print("3. Injurred: ",injure_num)
    injuredNumber.append(injure_num)
    if len(death_num)==0:
      deathNumber.append(1)
    else:
      deathNumber.append(death_num)

    time, partOfDay= find_time_of_accident(article.text)
    print("4. Time: ",time)
    print("5. Daypart: ",partOfDay)
    accTime.append(time)
    dayPart.append(partOfDay)

    extracted_vehicles = extract_vehicles(article.text)
    print("6. Vehicle: ",extracted_vehicles)
    vechileList.append(extracted_vehicles)

    if len(extracted_vehicles)==0:
      print("7. AccidentType: N/A")
      acciddentType.append("N/A")
    elif len(extracted_vehicles)==1:
      print("7. AccidentType: vehicle-human")
      acciddentType.append("vehicle-human")
    else:
      print("7. AccidentType: vehicle-vehicle")
      acciddentType.append("vehicle-vehicle")
    print('\n')
  except Exception as e:
    continue



0 -- 5 lives lost to gross neglect
1. Location:  ['uttara', 'islampur', 'dhaka', 'gazipur', 'jamalpur']
2. DeathNo:  [2.0, 12.0, 70.0, 15.0]
3. Injurred:  [3.0, 12.0]
4. Time:  ['4:00pm']
5. Daypart:  ['Afternoon']
6. Vehicle:  ['Bus', 'bus', 'car']
7. AccidentType: vehicle-vehicle


2 -- 4 killed, 10 injured in Chattogram road accident
1. Location:  ['palash', 'chattogram']
2. DeathNo:  [10.0, 4.0]
3. Injurred:  [10.0, 2.0]
4. Time:  ['1:30am']
5. Daypart:  ['Night']
6. Vehicle:  ['bus']
7. AccidentType: vehicle-human


4 -- 11 people killed as bus catches fire after road accident in India
1. Location:  []
2. DeathNo:  [11.0]
3. Injurred:  []
4. Time:  ['5:15am']
5. Daypart:  ['Morning']
6. Vehicle:  ['bus', 'truck']
7. AccidentType: vehicle-vehicle


5 -- Beyond awareness: A new decade of action on road safety
1. Location:  []
2. DeathNo:  [2.0]
3. Injurred:  []
4. Time:  []
5. Daypart:  []
6. Vehicle:  ['bus']
7. AccidentType: vehicle-human


6 -- Youth killed in Narail road acciden

In [70]:
len(injuredNumber)   #deathNumber

19

In [33]:
news_feature = pd.DataFrame(
    {'Date': newsDate, 'Location': accidentLocation,'Death Number': deathNumber,
     'Injured Number': injuredNumber,'Accident Time':  accTime, 'Part of Day': dayPart,
     'Accident Vechile': vechileList, 'Accident Type': acciddentType,
     'News Text': newsText, 'News Link': newsUrl
    })

news_feature

Unnamed: 0,Date,Location,Death Number,Injured Number,Accident Time,Part of Day,Accident Vechile,Accident Type,News Text,News Link
0,2022-08-16,"[uttara, islampur, dhaka, gazipur, jamalpur]","[2.0, 12.0, 70.0, 15.0]","[3.0, 12.0]",[4:00pm],[Afternoon],"[Bus, bus, car]",vehicle-vehicle,Crane drops heavy girder on car in Uttara; new...,https://www.thedailystar.net/news/bangladesh/a...
1,2022-09-15,"[palash, chattogram]","[10.0, 4.0]","[10.0, 2.0]",[1:30am],[Night],[bus],vehicle-human,"Four people were killed and 10 others, includi...",https://www.tbsnews.net/bangladesh/4-killed-10...
2,2022-10-08,[],[11.0],[],[5:15am],[Morning],"[bus, truck]",vehicle-vehicle,This undated UNB photo shows the debris of a c...,https://www.newagebd.net/article/183133/11-peo...
3,2022-10-26,[],[2.0],[],[],[],[bus],vehicle-human,Road traffic injuries (RTIs) are among the lea...,https://www.thedailystar.net/opinion/views/new...
4,2022-10-27,"[narail, dhaka, tularampur]",1,[],[8:00am],[Morning],"[car, motorcycle]",vehicle-vehicle,A youth has been killed on Thursday in a road ...,https://www.newagebd.net/article/184806/youth-...
...,...,...,...,...,...,...,...,...,...,...
173,2023-07-28,[],1,[],[],[],"[motorcycle, car]",vehicle-vehicle,People not interested amid absence of legal ob...,https://www.thedailystar.net/business/economy/...
174,2023-07-28,"[satkhira, kalaroa]",[8.0],[],[8am],[Morning],"[truck, motorcycle]",vehicle-vehicle,A motorcyclist was killed in a road accident o...,https://bangladeshpost.net/posts/motorcyclist-...
175,2023-07-28,[],[2.0],[],[],[],[bus],vehicle-human,Roads still unsafe despite infrastructure deve...,https://www.thedailystar.net/news/bangladesh/a...
176,2023-07-29,[],[6.0],[20.0],[],[],[],,,https://www.daily-bangladesh.com/english/inter...


In [36]:
news_feature.to_csv('news_feature.csv', sep=',', index=False, encoding='utf-8')


In [37]:
from google.colab import drive
drive.mount('/content/gdrive')
news_feature.to_csv('/content/gdrive/My Drive/news_feature.csv', index=False, encoding='utf-8')

Mounted at /content/gdrive


In [None]:
AccicdentLoc, NERloc= find_accident_loc(related_text, textWithTitle)
print(AccicdentLoc)
print(NERloc)
#age= find_age(clearText)
#print(age)

['sreepur', 'magura', 'faridpur', 'daulatpur', 'khulna']
[Magura, Khulna city, Chalna, Khulna, Khulna, Magura, Parchalna, Khulna, Magura, Parchalna]


In [None]:
#related_text.append(news['title'][i]) #title is needed to find death/injure number
death_num, injure_num= find_death_injure_number(related_text)
if len(death_num)==0:
  death_num.aeppen(1)
print(death_num)
print(injure_num)

death:  [2, 3]
inj:  []
[2, 3]
[]


In [None]:
time, partOfDay= find_time_of_accident(article.text)
print(time)
print(partOfDay)

time from ner- ['11:00am', '9:00am', 'morning']
['9:00am', '11:00am']
['Morning']


In [None]:
extracted_vehicles = extract_vehicles(article.text)
print(list(set(extracted_vehicles)))

['motorcycle']


***NOT required-- for testing ***