-
Notifications
You must be signed in to change notification settings - Fork 0
/
Ejercicios_TM.R
31 lines (26 loc) · 1.03 KB
/
Ejercicios_TM.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
## Cargo las librerias
library(tm)
library(wordcloud)
library(rtext)
## Build the corpus from the files found in the books directory
docs <- Corpus(
  DirSource(
    directory = "C:/Users/Layla Scheli/Desktop/Docencia/BA Emprende/Programa en Ciencia de Datos/6. Clase/Ejercicio_TM/Datos textos/Libros"
  )
)
## Clean the corpus.
##
## toSpace(x, pattern): replaces every regex match of `pattern` in `x` with a
## single space. It is wrapped in content_transformer() so that tm_map() can
## apply it to the content of each document.
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})
## Lower-case first so that the stopword removal below also catches
## capitalised occurrences.
docs <- tm_map(docs, content_transformer(tolower))
## Turn separator-like characters into spaces BEFORE punctuation is stripped,
## so the words they join are split apart instead of being glued together.
## (Previously the "`" and second "-" replacements ran after
## removePunctuation and could never match.)
docs <- tm_map(docs, toSpace, "-")
docs <- tm_map(docs, toSpace, ":")
docs <- tm_map(docs, toSpace, "´")
docs <- tm_map(docs, toSpace, "`")
## Remove stopwords while apostrophes are still present: the English list
## contains contractions (e.g. "don't", "i'm") that would no longer match
## once removePunctuation has deleted the apostrophe.
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removeWords, stopwords("spanish"))
## Strip what is left of punctuation and digits, then collapse the runs of
## blanks produced by the replacements and removals above.
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)
## Build the document-term matrix (one row per document, one column per term)
dtm <- DocumentTermMatrix(x = docs)
## Total number of occurrences of each term across all documents
freq <- colSums(x = as.matrix(dtm))
## Term positions sorted from the most frequent to the least frequent
ord <- order(freq, decreasing = TRUE)
## Word cloud of the most frequent terms.
## Minimum corpus-wide count a term needs in order to be drawn.
min_freq <- 200
## Plain logical subsetting is enough here; wrapping it in which() was redundant.
freq2 <- freq[freq >= min_freq]
if (length(freq2) > 0) {
  wordcloud(names(freq2), freq2)
} else {
  ## Nothing qualifies: report it instead of handing wordcloud() empty vectors.
  warning(
    "No term reaches a frequency of ", min_freq, "; word cloud not drawn.",
    call. = FALSE
  )
}