# Named Entity Recognition Pipeline

El pipeline toma una URL de un feed en formato RSS, obtiene el título y descripción de los artículos en el feed, detecta las NER con un modelo pre-entrenado, y las muestra ordenadas por frecuencia de aparición.

### Versiones
Probado con:
* Almond 0.6.0
* Ammonite 1.6.7
* Scala library version **2.11.12** -- Copyright 2002-2017, LAMP/EPFL
* Java 1.8.0_282

Para ver más información ir a (Help -> About Scala Kernel)

## 1. Obtener texto

### 1.1 Importar librerías

In [1]:
// Equivalent of adding dependencies to maven or sbt files
// For example, to add "org.scalaj" %% "scalaj-http" % "2.4.2" 
import $ivy.`org.scalaj::scalaj-http:2.4.2`
// "org.scala-lang.modules" %% "scala-xml" % "1.3.0"
import $ivy.`org.scala-lang.modules::scala-xml:1.3.0`

[32mimport [39m[36m$ivy.$                              
// "org.scala-lang.modules" %% "scala-xml" % "1.3.0"
[39m
[32mimport [39m[36m$ivy.$                                        [39m

In [2]:
import scalaj.http.{Http, HttpResponse}
import scala.xml.XML

[32mimport [39m[36mscalaj.http.{Http, HttpResponse}
[39m
[32mimport [39m[36mscala.xml.XML[39m

In [3]:
//IMPORRTAR LIBRERIAS PARA JSON
import $ivy.`org.json4s::json4s-jackson:3.4.0`

[32mimport [39m[36m$ivy.$                                 [39m

In [4]:
import org.json4s.JsonDSL._
import org.json4s._
import org.json4s.jackson.JsonMethods._

[32mimport [39m[36morg.json4s.JsonDSL._
[39m
[32mimport [39m[36morg.json4s._
[39m
[32mimport [39m[36morg.json4s.jackson.JsonMethods._[39m

In [5]:
implicit val formats = DefaultFormats

[36mformats[39m: [32mDefaultFormats[39m.type = org.json4s.DefaultFormats$@6dc9f590

In [6]:
import scala.util.matching.Regex

[32mimport [39m[36mscala.util.matching.Regex[39m

### 1.1 Obtener el texto del RSS Feed

Realizamos una consulta HTTP, que nos devuelve una instancia de HTTPResponse. Dentro del atributo `body` de la HTTPResponse, se encuentra el texto del feed en formato XML. Luego, se parsea el XML para extraer los campos `title` y `description`.

In [6]:
// Tutorial https://alvinalexander.com/source-code/scala-how-to-http-download-xml-rss-feed-timeout/
// get the xml content using scalaj-http
val urlRss = "https://www.chicagotribune.com/arcio/rss/category/sports/?query=display_date:[now-2d+TO+now]&sort=display_date:desc"

[36murlRss[39m: [32mString[39m = [32m"https://www.chicagotribune.com/arcio/rss/category/sports/?query=display_date:[now-2d+TO+now]&sort=display_date:desc"[39m

In [7]:
//url post de reddit
val urlJson = "https://www.reddit.com/r/Android/hot/.json?count=10"

[36murlJson[39m: [32mString[39m = [32m"https://www.reddit.com/r/Android/hot/.json?count=10"[39m

In [84]:
//url preguntas de Stack Exchange
val urlStack = "https://api.stackexchange.com/2.2/questions?order=desc&sort=activity&site=stackoverflow"

[36murlStack[39m: [32mString[39m = [32m"https://api.stackexchange.com/2.2/questions?order=desc&sort=activity&site=stackoverflow"[39m

In [8]:
//Para probar que andan las excepciones
val urlTrucho = "Hola"

[36murlTrucho[39m: [32mString[39m = [32m"Hola"[39m

In [9]:
trait DataSource {
    def getResponse(url: String): String = {
        try{
            val response = Http(url)
            .timeout(connTimeoutMs = 2000, readTimeoutMs = 5000)
            .asString
            
            response.body
        } 
        catch{
            case ex: java.net.MalformedURLException => ""
            case ex: java.net.ConnectException => ""
            case ex: java.net.UnknownHostException => ""
        }
    }
    
    def parseResponse(url: String): Seq[String]
}

defined [32mtrait[39m [36mDataSource[39m

In [10]:
class RssSource(fields: Seq[String] = Seq("title", "description")) extends DataSource {
    def parseResponse(url: String): Seq[String] = {
        val xml = XML.loadString(getResponse(url))
        
        (xml\\ "item").map { item =>
            fields.map { field =>
                (item \ s"$field").text + " "
            }.foldLeft("")(_+_)
        }
    }
}

defined [32mclass[39m [36mRssSource[39m

In [11]:
class JsonSource(fields: Seq[String] = Seq("title", "selftext")) extends DataSource {        
    //Expresion regular urls
    val urlRegex = "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]".r

    def removeUrls(secuencia: Seq[String]): Seq[String] = {
            secuencia.map(urlRegex replaceAllIn(_, " "))
    }

    def parseResponse(url: String): Seq[String] = {
        val res = (parse(getResponse(url))\"data"\"children"\"data").extract[List[Map[String, Any]]]
        
        removeUrls(res.map { Map =>
            fields.map { field =>
                Map(field) + " "
            }.foldLeft("")(_+_)
        })
    }
}

defined [32mclass[39m [36mJsonSource[39m

In [89]:
class StackQSource(fields: Seq[String] = Seq("title")) extends DataSource {        
    def parseResponse(url: String): Seq[String] = {
        val res = (parse(getResponse(url))\"items").extract[List[Map[String, Any]]]
        
        res.map { mp =>
            fields.map { field =>
                mp(field) + " "
            }.foldLeft("")(_+_)
        }
    }
}

defined [32mclass[39m [36mStackQSource[39m

In [12]:
val rssSource = new RssSource()

[36mrssSource[39m: [32mRssSource[39m = ammonite.$sess.cmd9$Helper$RssSource@433c6b07

In [13]:
val jsonSource = new JsonSource(Seq("title", "selftext"))

[36mjsonSource[39m: [32mJsonSource[39m = ammonite.$sess.cmd10$Helper$JsonSource@280386d5

In [90]:
val stackQSource = new StackQSource()

[36mstackQSource[39m: [32mStackQSource[39m = ammonite.$sess.cmd88$Helper$StackQSource@6a1485d8

In [128]:
val listStack = stackQSource.parseResponse(urlStack)

[36mlistStack[39m: [32mSeq[39m[[32mString[39m] = [33mList[39m(
  [32m"Efficient python loop structure to repeat something once "[39m,
  [32m"On html, How to make a type box where the user can submit an answer and I can see all the answers in a text file or something? "[39m,
  [32m"What is the pythonic/idiomatic way of filtering the output of a generator expression? "[39m,
  [32m"Need directions for basics of HPC on MZ Azure "[39m,
  [32m"A google sheet api http request with &quot;Service account key&quot; auth and without google/apiclient "[39m,
  [32m"Using a callback only when a filter returns boolean value true "[39m,
  [32m"SSH2 issues wiht Node.js "[39m,
  [32m"PyKinectv2 Body tracking count "[39m,
  [32m"How do I sum matching strings in a range of ints &amp; display as a stacked bar plot? "[39m,
  [32m"Add time delay in each rows "[39m,
  [32m"Can&#39;t Output Seaborn Scatterplot to Tkinter Frame "[39m,
  [32m"Is it possible to add a &quot;.&quot; i

In [14]:
val listRss = rssSource.parseResponse(urlRss)

[36mlistRss[39m: [32mSeq[39m[[32mString[39m] = [33mList[39m(
  [32m"4 things we heard from Chicago Bears offensive assistants, including competition for Anthony Miller and high expectations for Germain Ifedi Chicago Bears offensive coaches talked last week about some key topics involving veteran players, including wide receivers Anthony Miller and Allen Robinson, tight end Jimmy Graham and right tackle Germain Ifedi. "[39m,
  [32m"When it comes to Justin Fields\u2019 development, \u2018time is the biggest question\u2019 for the Chicago Bears. A look back at past rookie QBs lends insight into the process. Conventional thinking is the Chicago Bears will want to begin the season with Andy Dalton at QB, as the last thing they want to do is risk damaging Justin Fields\u2019 development by throwing him in before he\u2019s ready. But he\u2019s also the shiny new toy in the building, so Matt Nagy and his staff will be tempted to play the rookie. "[39m,
  [32m"Chicago Cubs welcome 

In [15]:
val listJson = jsonSource.parseResponse(urlJson)

[36mlistJson[39m: [32mSeq[39m[[32mString[39m] = [33mList[39m(
  [32m"""Sunday Rant/Rage (May 16 2021) - Your weekly complaint thread! Note 1. Join our IRC, and Telegram chat-rooms! [Please see our wiki for instructions.]( )

This weekly Sunday thread is for you to let off some steam and speak out about whatever complaint you might have about:  

* Your device.  

* Your carrier.  

* Your device's manufacturer.  

* An app  

* Any other company

***  

**Rules**  

1) Please do not target any individuals or try to name/shame any individual. If you hate Google/Samsung/HTC etc. for one thing that is fine, but do not be rude to an individual app developer.

2) If you have a suggestion to solve another user's issue, please leave a comment but be sure it's constructive! We do not want any flame-wars.  

3) Be respectful of other's opinions. Even if you feel that somebody is "wrong" you don't have to go out of your way to prove them wrong. Disagree politely, and move on. """[39m,


## 2. Detectar las entidades nombradas

### 2.1 Crear el modelo

El **modelo** es sólo la función `getNEs`, que recibe una lista de textos.
Para cada texto, se separa las palabras del texto usando los espacios, y considera que es una entidad nombrada si empieza con mayúscula.

In [161]:
case class NERCount(ner: String, count:Double)

class NERModel(){
    // Este código lista los signos de puntuación y algunas palabras comunes del inglés que se van a sacar del texto.
    val STOPWORDS = Seq (
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
        "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she",
        "her", "hers", "herself", "it", "its", "itself", "they", "them", "your",
        "their", "theirs", "themselves", "what", "which", "who", "whom",
        "this", "that", "these", "those", "am", "is", "are", "was", "were",
        "be", "been", "being", "have", "has", "had", "having", "do", "does",
        "did", "doing", "a", "an", "the", "and", "but", "if", "or",
        "because", "as", "until", "while", "of", "at", "by", "for", "with",
        "about", "against", "between", "into", "through", "during", "before",
        "after", "above", "below", "to", "from", "up", "down", "in", "out",
        "off", "over", "under", "again", "further", "then", "once", "here",
        "there", "when", "where", "why", "how", "all", "any", "both", "each",
        "few", "more", "most", "other", "some", "such", "no", "nor", "not",
        "only", "own", "same", "so", "than", "too", "very", "s", "t", "can",
        "will", "just", "don", "should", "now", "on", "thank", "enjoy",
        "please", "may", "even", "forgot", "things", "well", "hey", "much",
        "lets", "look",
        // Contractions without '
        "im", "ive", "id", "Youre", "youd", "youve",
        "hes", "hed", "shes", "shed", "itd", "were", "wed", "weve",
        "theyre", "theyd", "theyve",
        "shouldnt", "couldnt", "musnt", "cant", "wont",
        // Common uppercase words
        "hi", "hello", "url"
    )
    val punctuationSymbols = ".,()!?;:'`´\n<>-’"
    val punctuationRegex = "\\" + punctuationSymbols.split("").mkString("|\\")
    
    // Extract Named Entities from a single text
    def getNEsSingle(text: String): Seq[String] =
      text.replaceAll(punctuationRegex, "").split(" ")
        .filter { word:String => word.length > 1 &&
                  Character.isUpperCase(word.charAt(0)) &&
                  !STOPWORDS.contains(word.toLowerCase) }.toSeq

    def getNEs(textList: Seq[String]): Seq[Seq[String]] = textList.map(getNEsSingle)
    
    //Count and sort Named Entities
    
    def getCounts(result: Seq[Seq[String]]): Seq[Map[String, Double]] = {
      val countsMaps: Seq[Map[String, Double]] = result.map { list =>
        list.foldLeft(Map.empty[String, Double]) {
          (count, word) => count + (word -> (count.getOrElse(word, 0.0) + 1.0))
        }
      } 
      countsMaps.map {
          Map => Map mapValues (_/Map.values.sum) }
    }
    

    def sortNEs(result: Seq[Seq[String]]): List[NERCount] = {
        val counts: Map[String, Double] = getCounts(result).flatten
            .foldLeft(Map.empty[String, Double]) {
            (count, wordCount) => count + (wordCount._1 -> (count.getOrElse(wordCount._1, 0.0) + wordCount._2)) }
        counts.toList.map { case (word, count) => NERCount(word, count) }
        .sortBy(_.count)(Ordering[Double].reverse)
    }
    
}

defined [32mclass[39m [36mNERCount[39m
defined [32mclass[39m [36mNERModel[39m

In [162]:
val model = new NERModel

[36mmodel[39m: [32mNERModel[39m = ammonite.$sess.cmd160$Helper$NERModel@5d848e99

### 2.2 Aplicar el "Modelo" a los datos

In [163]:
val resultRss = model.getNEs(listRss)

[36mresultRss[39m: [32mSeq[39m[[32mSeq[39m[[32mString[39m]] = [33mList[39m(
  [33mArrayBuffer[39m(
    [32m"Chicago"[39m,
    [32m"Bears"[39m,
    [32m"Anthony"[39m,
    [32m"Miller"[39m,
    [32m"Germain"[39m,
    [32m"Ifedi"[39m,
    [32m"Chicago"[39m,
    [32m"Bears"[39m,
    [32m"Anthony"[39m,
    [32m"Miller"[39m,
    [32m"Allen"[39m,
    [32m"Robinson"[39m,
    [32m"Jimmy"[39m,
    [32m"Graham"[39m,
    [32m"Germain"[39m,
    [32m"Ifedi"[39m
  ),
  [33mArrayBuffer[39m(
    [32m"Justin"[39m,
    [32m"Fields"[39m,
    [32m"Chicago"[39m,
    [32m"Bears"[39m,
    [32m"QBs"[39m,
    [32m"Conventional"[39m,
    [32m"Chicago"[39m,
    [32m"Bears"[39m,
    [32m"Andy"[39m,
    [32m"Dalton"[39m,
    [32m"QB"[39m,
    [32m"Justin"[39m,
    [32m"Fields"[39m,
    [32m"Matt"[39m,
    [32m"Nagy"[39m
  ),
  [33mArrayBuffer[39m(
    [32m"Chicago"[39m,
    [32m"Cubs"[39m,
...

In [165]:
val resultJson = model.getNEs(listJson)

[36mresultJson[39m: [32mSeq[39m[[32mSeq[39m[[32mString[39m]] = [33mList[39m(
  [33mArrayBuffer[39m(
    [32m"Sunday"[39m,
    [32m"Rant/Rage"[39m,
    [32m"Note"[39m,
    [32m"Join"[39m,
    [32m"IRC"[39m,
    [32m"Telegram"[39m,
    [32m"Sunday"[39m,
    [32m"Google/Samsung/HTC"[39m,
    [32m"Disagree"[39m
  ),
  [33mArrayBuffer[39m(
    [32m"Community"[39m,
    [32m"Feedback"[39m,
    [32m"Poll"[39m,
    [32m"February"[39m,
    [32m"Participation"[39m,
    [32m"NOTES*"[39m,
    [32m"Google"[39m,
    [32m"Forms"[39m,
    [32m"Email"[39m,
    [32m"Responses"[39m,
    [32m"POLL]"[39m,
    [32m"Edit"[39m
  ),
  [33mArrayBuffer[39m([32m"Google"[39m, [32m"I/O"[39m, [32m"Keynote"[39m),
  [33mArrayBuffer[39m([32m"Magisk"[39m, [32m"Apple"[39m, [32m"Androids"[39m),
  [33mArrayBuffer[39m([32m"Flexible"[39m, [32m"Samsung"[39m, [32m"OLED"[39m, [32m"SID"[39m, [32m"Display"[39m, [32m"Week"[39m),
  [33mArrayBuffer

In [164]:
val resultStack = model.getNEs(listStack)

[36mresultStack[39m: [32mSeq[39m[[32mSeq[39m[[32mString[39m]] = [33mList[39m(
  [33mArrayBuffer[39m([32m"Efficient"[39m),
  [33mArrayBuffer[39m(),
  [33mArrayBuffer[39m(),
  [33mArrayBuffer[39m([32m"Need"[39m, [32m"HPC"[39m, [32m"MZ"[39m, [32m"Azure"[39m),
  [33mArrayBuffer[39m(),
  [33mArrayBuffer[39m([32m"Using"[39m),
  [33mArrayBuffer[39m([32m"SSH2"[39m, [32m"Nodejs"[39m),
  [33mArrayBuffer[39m([32m"PyKinectv2"[39m, [32m"Body"[39m),
  [33mArrayBuffer[39m(),
  [33mArrayBuffer[39m([32m"Add"[39m),
  [33mArrayBuffer[39m(
    [32m"Can&#39t"[39m,
    [32m"Output"[39m,
    [32m"Seaborn"[39m,
    [32m"Scatterplot"[39m,
    [32m"Tkinter"[39m,
    [32m"Frame"[39m
  ),
  [33mArrayBuffer[39m([32m"Mathrandom"[39m),
  [33mArrayBuffer[39m([32m"Adding"[39m, [32m"Festival"[39m, [32m"Software"[39m),
  [33mArrayBuffer[39m(),
  [33mArrayBuffer[39m([32m"Storing"[39m, [32m"Unity"[39m, [32m"WebGL"[39m),
  [33mArrayBuf

## 3. Contar y ordenar las entidades

Concatenar todas las listas, contar cada Named Entity, y luego ordernar por frecuencia

In [166]:
val stackCounts = model.getCounts(resultStack)

[36mstackCounts[39m: [32mSeq[39m[[32mMap[39m[[32mString[39m, [32mDouble[39m]] = [33mList[39m(
  [33mMap[39m([32m"Efficient"[39m -> [32m1.0[39m),
  [33mMap[39m(),
  [33mMap[39m(),
  [33mMap[39m([32m"Need"[39m -> [32m0.25[39m, [32m"HPC"[39m -> [32m0.25[39m, [32m"MZ"[39m -> [32m0.25[39m, [32m"Azure"[39m -> [32m0.25[39m),
  [33mMap[39m(),
  [33mMap[39m([32m"Using"[39m -> [32m1.0[39m),
  [33mMap[39m([32m"SSH2"[39m -> [32m0.5[39m, [32m"Nodejs"[39m -> [32m0.5[39m),
  [33mMap[39m([32m"PyKinectv2"[39m -> [32m0.5[39m, [32m"Body"[39m -> [32m0.5[39m),
  [33mMap[39m(),
  [33mMap[39m([32m"Add"[39m -> [32m1.0[39m),
  [33mMap[39m(
    [32m"Frame"[39m -> [32m0.16666666666666666[39m,
    [32m"Seaborn"[39m -> [32m0.16666666666666666[39m,
    [32m"Can&#39t"[39m -> [32m0.16666666666666666[39m,
    [32m"Scatterplot"[39m -> [32m0.16666666666666666[39m,
    [32m"Output"[39m -> [32m0.16666666666666666[39m,
    [3

In [167]:
val rssCounts = model.getCounts(resultRss)

[36mrssCounts[39m: [32mSeq[39m[[32mMap[39m[[32mString[39m, [32mDouble[39m]] = [33mList[39m(
  [33mMap[39m(
    [32m"Ifedi"[39m -> [32m0.125[39m,
    [32m"Anthony"[39m -> [32m0.125[39m,
    [32m"Germain"[39m -> [32m0.125[39m,
    [32m"Chicago"[39m -> [32m0.125[39m,
    [32m"Allen"[39m -> [32m0.0625[39m,
    [32m"Graham"[39m -> [32m0.0625[39m,
    [32m"Miller"[39m -> [32m0.125[39m,
    [32m"Bears"[39m -> [32m0.125[39m,
    [32m"Robinson"[39m -> [32m0.0625[39m,
    [32m"Jimmy"[39m -> [32m0.0625[39m
  ),
  [33mMap[39m(
    [32m"Matt"[39m -> [32m0.06666666666666667[39m,
    [32m"QBs"[39m -> [32m0.06666666666666667[39m,
    [32m"QB"[39m -> [32m0.06666666666666667[39m,
    [32m"Conventional"[39m -> [32m0.06666666666666667[39m,
    [32m"Chicago"[39m -> [32m0.13333333333333333[39m,
    [32m"Dalton"[39m -> [32m0.06666666666666667[39m,
    [32m"Nagy"[39m -> [32m0.06666666666666667[39m,
    [32m"Andy"[39m -> [32

In [168]:
rssCounts.map { mp => mp.values.sum }

[36mres167[39m: [32mSeq[39m[[32mDouble[39m] = [33mList[39m(
  [32m1.0[39m,
  [32m0.9999999999999999[39m,
  [32m1.0000000000000002[39m,
  [32m1.0[39m,
  [32m1.0[39m,
  [32m0.9999999999999998[39m,
  [32m0.9999999999999999[39m,
  [32m0.9999999999999998[39m,
  [32m0.9999999999999998[39m,
  [32m1.0000000000000002[39m,
  [32m1.0[39m,
  [32m1.0[39m,
  [32m1.0[39m,
  [32m1.0[39m,
  [32m0.9999999999999999[39m,
  [32m1.0[39m,
  [32m1.0[39m,
  [32m0.9999999999999998[39m,
  [32m1.0[39m,
  [32m1.0[39m,
  [32m1.0[39m,
  [32m1.0[39m,
  [32m1.0[39m,
  [32m0.9999999999999996[39m,
  [32m1.0[39m
)

In [169]:
val sortedNEs = model.sortNEs(resultRss)

[36msortedNEs[39m: [32mList[39m[[32mNERCount[39m] = [33mList[39m(
  [33mNERCount[39m([32m"Chicago"[39m, [32m3.0354384177913585[39m),
  [33mNERCount[39m([32m"Cubs"[39m, [32m1.0075396825396825[39m),
  [33mNERCount[39m([32m"White"[39m, [32m0.7749766573295984[39m),
  [33mNERCount[39m([32m"Sox"[39m, [32m0.6712184873949579[39m),
  [33mNERCount[39m([32m"Bears"[39m, [32m0.583008658008658[39m),
  [33mNERCount[39m([32m"Bulls"[39m, [32m0.5357142857142857[39m),
  [33mNERCount[39m([32m"Field"[39m, [32m0.4913632119514473[39m),
  [33mNERCount[39m([32m"Kyle"[39m, [32m0.42658730158730157[39m),
  [33mNERCount[39m([32m"Photos"[39m, [32m0.39470333587980644[39m),
  [33mNERCount[39m([32m"Fields"[39m, [32m0.3865800865800866[39m),
  [33mNERCount[39m([32m"Justin"[39m, [32m0.3865800865800866[39m),
  [33mNERCount[39m([32m"Abreu"[39m, [32m0.3537581699346405[39m),
  [33mNERCount[39m([32m"Wrigley"[39m, [32m0.346031746031746[39m),


In [170]:
val sortiaDo = model.sortNEs(resultJson)

[36msortiaDo[39m: [32mList[39m[[32mNERCount[39m] = [33mList[39m(
  [33mNERCount[39m([32m"Android"[39m, [32m2.1337923728813557[39m),
  [33mNERCount[39m([32m"Google"[39m, [32m1.8753177966101695[39m),
  [33mNERCount[39m([32m"Pixel"[39m, [32m0.9929378531073445[39m),
  [33mNERCount[39m([32m"New"[39m, [32m0.7873015873015872[39m),
  [33mNERCount[39m([32m"Pro"[39m, [32m0.723340395480226[39m),
  [33mNERCount[39m([32m"Androids"[39m, [32m0.6002824858757062[39m),
  [33mNERCount[39m([32m"I/O"[39m, [32m0.5502824858757063[39m),
  [33mNERCount[39m([32m"BinaryEye"[39m, [32m0.5[39m),
  [33mNERCount[39m([32m"Microsoft"[39m, [32m0.43333333333333335[39m),
  [33mNERCount[39m([32m"Display"[39m, [32m0.41666666666666663[39m),
  [33mNERCount[39m([32m"Apple"[39m, [32m0.4096045197740113[39m),
  [33mNERCount[39m([32m"RAM"[39m, [32m0.3730579096045198[39m),
  [33mNERCount[39m([32m"Samsung"[39m, [32m0.3584297919250379[39m),
  [33mN

In [171]:
val sortedStack = model.sortNEs(resultStack)

[36msortedStack[39m: [32mList[39m[[32mNERCount[39m] = [33mList[39m(
  [33mNERCount[39m([32m"Using"[39m, [32m1.0[39m),
  [33mNERCount[39m([32m"Efficient"[39m, [32m1.0[39m),
  [33mNERCount[39m([32m"Determine"[39m, [32m1.0[39m),
  [33mNERCount[39m([32m"Add"[39m, [32m1.0[39m),
  [33mNERCount[39m([32m"Route"[39m, [32m1.0[39m),
  [33mNERCount[39m([32m"Flask"[39m, [32m1.0[39m),
  [33mNERCount[39m([32m"Mathrandom"[39m, [32m1.0[39m),
  [33mNERCount[39m([32m"Order"[39m, [32m1.0[39m),
  [33mNERCount[39m([32m"Type"[39m, [32m0.6666666666666666[39m),
  [33mNERCount[39m([32m"Request"[39m, [32m0.5[39m),
  [33mNERCount[39m([32m"Format"[39m, [32m0.5[39m),
  [33mNERCount[39m([32m"SSH2"[39m, [32m0.5[39m),
  [33mNERCount[39m([32m"JSON"[39m, [32m0.5[39m),
  [33mNERCount[39m([32m"Change"[39m, [32m0.5[39m),
  [33mNERCount[39m([32m"Due"[39m, [32m0.5[39m),
  [33mNERCount[39m([32m"Nodejs"[39m, [32m0.5[39m),
  

## 4. Modelo "FeedService"

In [202]:
class FeedServices {
    case class Subscription(url: String, parser: DataSource)
    //variable donde se guardan las suscripciones
    var subsList: Seq[Subscription] = Seq()
    
    //funcion para agregar subscripciones a la lista 
    def subscribe[T <: DataSource](template: String, urlParams: Seq[String] = Seq(), parser: T) = {
        val urls: Seq[String] = (if (urlParams.isEmpty) { Seq(template) } else {
            urlParams.map { param => template.format(param) }
        //filtramos para no agregar de nuevo una url que ya estaba en la lista:
        }).filter { url =>
            !(subsList.map(_.url).contains(url))
        }        
        
        val subscriptions: Seq[Subscription] = urls.map {url =>
            Subscription(url, parser)
        }
        
        subsList = subsList ++ subscriptions
    }
    
    //busca todas las entidades en las subscripciones
    //y las añade a una misma lista
    def parseAll(): Seq[String] = {    
        val parsedList = subsList.map { sub =>
            sub.parser.parseResponse(sub.url)
        }
        parsedList.flatten
    }
}

defined [32mclass[39m [36mFeedServices[39m

In [232]:
val service = new FeedServices

[36mservice[39m: [32mFeedServices[39m = ammonite.$sess.cmd201$Helper$FeedServices@7ffea8f6

In [233]:
//Probamos omitir el parámetro opcional urlParam
service.subscribe(template = "https://www.chicagotribune.com/arcio/rss/category/business/?query=display_date:[now-2d+TO+now]&sort=display_date:desc", urlParams = Seq(), parser = new RssSource())
//Probamos con parámetro
service.subscribe(template = "https://rss.nytimes.com/services/xml/rss/nyt/%s.xml", urlParams = Seq("Business", "Technology"), parser = new RssSource())
//service.subscribe(template = "https://www.reddit.com/r/%s/hot/.json?count=100", urlParams = Seq("Marketing", "Sales", "Entrepreneur", "Startups"), parser = new JsonSource())
//service.subscribe(template = "https://api.stackexchange.com/2.2/questions?order=desc&sort=activity&site=%s", urlParams = Seq("stackoverflow", "serverfault"), parser = new StackQSource())

In [234]:
println(service.subsList)

List(Subscription(https://www.chicagotribune.com/arcio/rss/category/business/?query=display_date:[now-2d+TO+now]&sort=display_date:desc,ammonite.$sess.cmd9$Helper$RssSource@467ef14b), Subscription(https://rss.nytimes.com/services/xml/rss/nyt/Business.xml,ammonite.$sess.cmd9$Helper$RssSource@352dd2db), Subscription(https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml,ammonite.$sess.cmd9$Helper$RssSource@352dd2db))


In [235]:
val texts = service.parseAll()

[36mtexts[39m: [32mSeq[39m[[32mString[39m] = [33mList[39m(
  [32m"\u2018Car guy\u2019 Joe Biden took a spin in the new electric-powered Ford F-150 Lightning during a trip to Michigan to sell his $2T infrastructure plan Joe Biden took a spin in the new Ford F-150 Lightning truck during a visit to a Ford safety testing center Tuesday as part of a trip to Michigan. "[39m,
  [32m"COVID-19 has left scars on the job market. Here\u2019s why the damage may be permanent. At least 30% of the U.S. jobs lost to the pandemic aren\u2019t expected to come back \u2014 a sizable proportion of them at employers that require face-to-face contact with consumers: Hotels, restaurants, retailers, entertainment venues. "[39m,
  [32m"In Chicago and across US, downtown stores, restaurants await customers\u2019 return \u201cI definitely think the Loop will return back to normal at some point,\u201d says Teresa Ging of Sugar Bliss Bakery, although she doesn\u2019t expect that to happen before 2022. "

In [236]:
val model = new NERModel()

[36mmodel[39m: [32mNERModel[39m = ammonite.$sess.cmd160$Helper$NERModel@406ef726

In [237]:
val res = model.getNEs(texts)

[36mres[39m: [32mSeq[39m[[32mSeq[39m[[32mString[39m]] = [33mList[39m(
  [33mArrayBuffer[39m(
    [32m"Joe"[39m,
    [32m"Biden"[39m,
    [32m"Ford"[39m,
    [32m"F150"[39m,
    [32m"Lightning"[39m,
    [32m"Michigan"[39m,
    [32m"Joe"[39m,
    [32m"Biden"[39m,
    [32m"Ford"[39m,
    [32m"F150"[39m,
    [32m"Lightning"[39m,
    [32m"Ford"[39m,
    [32m"Tuesday"[39m,
    [32m"Michigan"[39m
  ),
  [33mArrayBuffer[39m([32m"COVID19"[39m, [32m"Heres"[39m, [32m"US"[39m, [32m"Hotels"[39m),
  [33mArrayBuffer[39m(
    [32m"Chicago"[39m,
    [32m"US"[39m,
    [32m"Loop"[39m,
    [32m"Teresa"[39m,
    [32m"Ging"[39m,
    [32m"Sugar"[39m,
    [32m"Bliss"[39m,
    [32m"Bakery"[39m
  ),
  [33mArrayBuffer[39m(
    [32m"Boeing"[39m,
    [32m"FAA"[39m,
    [32m"Max"[39m,
    [32m"Dreamliner"[39m,
    [32m"Two"[39m,
    [32m"Congress"[39m,
    [32m"Boeing"[39m,
    [32m"Federal"[39m,
    [32m"Aviation"[39m,
    [32

In [238]:
val finalRes = model.sortNEs(res)

[36mfinalRes[39m: [32mList[39m[[32mNERCount[39m] = [33mList[39m(
  [33mNERCount[39m([32m"China"[39m, [32m1.222843822843823[39m),
  [33mNERCount[39m([32m"Walmart"[39m, [32m1.2222222222222223[39m),
  [33mNERCount[39m([32m"US"[39m, [32m1.2191770647653002[39m),
  [33mNERCount[39m([32m"Biden"[39m, [32m1.1350140056022409[39m),
  [33mNERCount[39m([32m"WarnerMedia"[39m, [32m1.0654382167540062[39m),
  [33mNERCount[39m([32m"AT&T"[39m, [32m1.0071770334928232[39m),
  [33mNERCount[39m([32m"Coinbase"[39m, [32m1.0[39m),
  [33mNERCount[39m([32m"Buying"[39m, [32m1.0[39m),
  [33mNERCount[39m([32m"Apple"[39m, [32m0.9803030303030303[39m),
  [33mNERCount[39m([32m"Discovery"[39m, [32m0.9667710944026733[39m),
  [33mNERCount[39m([32m"President"[39m, [32m0.9598039215686275[39m),
  [33mNERCount[39m([32m"Pipeline"[39m, [32m0.804040404040404[39m),
  [33mNERCount[39m([32m"Colonial"[39m, [32m0.804040404040404[39m),
  [33mNERCount

In [201]:
println(finalRes)

List(NERCount(Chicago,3.454145299145299), NERCount(County,2.052802181478652), NERCount(Tuesday,1.908567821067821), NERCount(US,1.80413377472201), NERCount(El,1.6225579975579976), NERCount(New,1.5419413919413918), NERCount(Biden,1.4540616246498597), NERCount(Illinois,1.3333333333333333), NERCount(China,1.222843822843823), NERCount(WarnerMedia,1.0654382167540062), NERCount(Lollapalooza,1.0602564102564103), NERCount(Park,1.0444555444555443), NERCount(Es,1.0), NERCount(Coinbase,1.0), NERCount(Buying,1.0), NERCount(Apple,0.9803030303030303), NERCount(Wednesday,0.892156862745098), NERCount(Monday,0.880079365079365), NERCount(Federal,0.8671568627450981), NERCount(President,0.8526610644257704), NERCount(AT&T,0.8405103668261564), NERCount(Pipeline,0.804040404040404), NERCount(Colonial,0.804040404040404), NERCount(Discovery,0.8001044277360067), NERCount(Amazon,0.7816239316239315), NERCount(Washington,0.7598039215686274), NERCount(Trump,0.7556489262371615), NERCount(Police,0.7529151404151404), NE

## 4. Modelo Marcos

1- Hay vario sitios de noticia, por ejemplo tenemos el sitio de noticia del diario la nación, de infobae, de radio mitre.

Modelamos esto con una clase tipo servicio de noticas. Por ejemplo creamos un objeto del sitio LaNacion

*servicioDeNoticias = new ServicioDeNoticas(“LaNacion”);*


2- Cada uno de los sitios de noticia tiene varias formas de acceder a las noticias. Por ejemplo se pueden leer las noticias de la nación en formato xml, en formato url o en texto plano.

Modelamos esto haciendo que el objeto servicioDeNoticia tenga un método obtenerArticulosDesdeUnFormatoXML  que nos devuelve un listado de artículos. Cada articulo tiene  un método título y un método descripción.

*listaDeArticulo = servicioDeNoticia.ObtenerArticulosFormatoXML(url)*



3- Hay dos modelos de clasificación, uno usando un conjunto de reglas y el otro usando redes neuronales. Los modelos tienen un método detectarEntidadesNombradas que reciben una listaDeArticulos y devuelve una lista de entidadesNombradas. 

Modelamos esto usando herencia. Las clases ModeloDeClasificacionUsandoConjuntoDeReglas y  ModeloDeClasificacionUsandoRedesNeuronales heredan de ModeloDeClasificacion. 

La clase ModeloDeClasificacion tiene un metodo virtual DetectarEntidadesNombradas que recibe como parametros una listaDeArticulos y devuelve una listaDeEntidadesNombradas. Las entidades nombradas tienen un método nombre y un método frecuencia.

Modelamos esto de la siguiente manera:

*listaDeEntidadesNombradas = ModeloDeClasificacionUsandoConjuntoDeReglas.DetectarEntidadesNombradas(ListaDeArticulos)*


#Pseudocodigo

In [None]:
listaDeArticulos = new ListaDeArticulos(Articulos)
listaDeEntidadesNombradas = new ListaDeEntidadesNombradas(EntidadesNombradas)


In [None]:
servicioDeNoticasLaNacion = new ServicioDeNoticas("LaNacion")
direccionUrlDeLasNoticiasConFormatoXML = "http://unNombreDeDominio.unArchivo.xml"
listaDeArticulo = servicioDeNoticiaLaNacion.ObtenerArticulosDesdeUnFormatoXML(direccionUrlDeLasNoticiasConFormatoXML)
listaDeEntidadesNombradas = ModeloDeClasificacionUsandoConjuntoDeReglas.detectarEntidadesNombradas(ListaDeArticulos)
for entidadNombrada in listaDeEntidadesNombradas:
    print entidadNombrada.Nombre
    print entidadNombrada.Frecuencia


In [None]:
direccionUrlDeLasNoticiasConFormatoJSON = "http://wxdd.ss.archivo.json"
listaDeArticulos = servicioDeNoticiaLaNacion.ObtenerArticulosDesdeUnFormatoJSON(direccionUrlDeLasNoticiasConFormatoJSON)
listaDeEntidadesNombradas = ModeloDeClasificacionUsandoConjuntoDeReglas.detectarEntidadesNombradas(ListaDeArticulos)
for entidadNombrada in listaDeEntidadesNombradas:
    print entidadNombrada.Nombre
    print entidadNombrada.Frecuencia


In [None]:
servicioDeNoticasInfobae = new ServicioDeNoticas("Infobae")
listaDeEntidadesNombradasInfobaeTextoPlano = ListaDeEntidadesNombradas(EntidadesNombradas)
direccionUrlDeLasNoticiasConFormatoTextoPlano = "c:/users/documents/archivo.txt"
listaDeArticulos = servicioDeNoticiaInfobae.ObtenerArticulosDesdeUnFormatoTextoPlano(direccionUrlDeLasNoticiasConFormatoTextoPlano)

listaDeEntidadesNombradas = ModeloDeClasificacionUsandoConjuntoDeReglas.detectarEntidadesNombradas(ListaDeArticulos)

for entidadNombrada in listaDeEntidadesNombradas:
    print entidadNombrada.Nombre
    print entidadNombrada.Frecuencia


In [None]:
Clase: ServicioDeNoticia
Metodos:
obtenerArticulosDesdeUnFormatoXML(direccionUrl):ListaDeArticulos
obtenerArticulosDesdeUnFormatoJSON(direccionUrl):ListaDeArticulos

ListaDeArticulos
Clase:Articulo
Metodos:
  Titulo
  Descripcion


ModeloDeClasificacionUsandoRedesNeuronales:ModeloDeClasificacion
ModeloDeClasificacionUsandoConjuntoDeReglas:ModeloDeClasificacion
detectarEntidadesNombradas(ListaDeArticulos):EntidadesNombradas

ListaDeEntidadesNombradas
Clase: EntidadesNombradas
Metodos:
  Nombre
  Frecuencia
