# Exploring and Analyzing Text Data Assignment 

In [1]:
import spacy
import string
import pandas as pd
from nltk import pos_tag
from nltk.text import Text
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

# Download the en_core_web_sm spaCy model through the shell, then load it.
# The resulting `nlp` pipeline is what the later cells call for named entities.
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


### Read the CNN Lite plain text articles into a corpus using NLTK's PlaintextCorpusReader.

In [2]:
# Directory holding the article files.
# NOTE(review): the recorded run of this cell ended in an OSError, so this
# directory presumably was not reachable (Drive not mounted?) — confirm the
# path exists before running.
path = '/content/drive/MyDrive/Datasets/cnn_articles'
# Regex selecting which files in `path` become corpus documents (*.txt).
doc_pattern = r'.*\.txt'
corpus = PlaintextCorpusReader(path, doc_pattern)
corpus

OSError: ignored

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [None]:
# Show the identifiers of the files that the corpus reader picked up.
corpus.fileids()

In [None]:
# Raw text of every file in the corpus, in fileid order.
docs = list(map(corpus.raw, corpus.fileids()))
# Display the first document as a quick check.
docs[0]

### Write a function that calculates the following statistics for a document and returns them as a list.

- Number of sentences
- Number of tokens
- Number of words (no stop words or punctuation)
- Number of unique words (vocabulary)
- Number of unique named entities (excluding numbers, dates, times, and currency types)
- Average sentence length
- Average word length
- Lexical diversity

In [None]:
def doc_stats(doc):
    """Compute summary statistics for a single document.

    Args:
        doc: Raw text of the document as a string.

    Returns:
        A list of eight values, in this order:
            0. number of sentences
            1. number of tokens
            2. number of words (lower-cased tokens that are neither English
               stop words nor made up entirely of punctuation characters)
            3. vocabulary size (number of unique words)
            4. number of unique named entities, excluding dates, times,
               percentages, money, quantities, ordinals and cardinals
            5. average number of words per sentence
            6. average number of characters per word
            7. lexical diversity (vocabulary size / number of words)
        The three ratios are 0.0 when their denominator is zero (for example
        an empty document) instead of raising ZeroDivisionError.
    """
    sents = sent_tokenize(doc)
    tokens = word_tokenize(doc)

    # Build both lookup sets once so every token costs a single O(1)
    # membership test rather than fetching the stop-word list per token.
    stop_words = set(stopwords.words('english'))
    punct_chars = set(string.punctuation)

    # A token counts as punctuation when every character is punctuation.
    # This also drops multi-character tokens such as "``", "''" or "...",
    # which a plain substring test against string.punctuation lets through.
    words = [token.lower() for token in tokens
             if token.lower() not in stop_words
             and not all(ch in punct_chars for ch in token)]

    num_sents = len(sents)
    num_tokens = len(tokens)
    num_words = len(words)
    vocab = len(set(words))
    chars = sum(len(word) for word in words)

    # Entity labels that describe numbers, dates, times and currency amounts.
    excluded_labels = {'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY',
                       'ORDINAL', 'CARDINAL'}
    spacy_doc = nlp(doc)
    unique_ents = {ent.text for ent in spacy_doc.ents
                   if ent.label_ not in excluded_labels}
    num_ents = len(unique_ents)

    # Guard the ratios so one degenerate document cannot abort a whole run.
    words_sent = num_words / num_sents if num_sents else 0.0
    chars_word = chars / num_words if num_words else 0.0
    lex_div = vocab / num_words if num_words else 0.0

    return [num_sents, num_tokens, num_words, vocab, num_ents, words_sent,
            chars_word, lex_div]

# Try the function on the first document and display the resulting list.
doc_stats(docs[0])

### Iterate through all the documents, calculate these statistics for each one, and store all the results in a Pandas data frame.

In [None]:
# Column labels, listed in the same order as the values doc_stats returns.
columns = [
    'Number of sentences',
    'Number of tokens',
    'Number of words',
    'Vocabulary',
    'Number of Named Entities',
    'Avg sentence length',
    'Avg word length',
    'Lexical diversity',
]
# One list of statistics per document, turned into one row per document.
stats = list(map(doc_stats, docs))
stats_df = pd.DataFrame(stats, columns=columns)
stats_df

### Summarize these statistics for the entire corpus by calling the Pandas `describe` method.

In [None]:
# Corpus-level summary of the per-document statistics.
stats_df.describe()

### Choose a document from the list of documents you created earlier and generate a frequency distribution bar chart for it showing which terms appear most frequently in the text.

In [None]:
doc = docs[10]

# Fetch the English stop-word list once and keep it as a set, so filtering
# is one O(1) lookup per token rather than a fresh list fetch per token.
english_stops = set(stopwords.words('english'))

# Lower-cased, purely alphabetic tokens with stop words removed.
cleaned = [token.lower() for token in word_tokenize(doc)
           if token.isalpha()
           if token.lower() not in english_stops]

# Count how often each remaining term occurs in the document.
fdist = FreqDist(cleaned)
fdist

In [None]:
# Turn the frequency distribution into a two-column table: the terms (moved
# out of the index by reset_index) and their counts.
df = pd.DataFrame.from_dict(fdist, orient='index').reset_index()
df

In [None]:
# Give the two columns readable names; later cells refer to 'term' and 'freq'.
df.columns = ['term', 'freq']
df

In [None]:
# Order the terms from most to least frequent. The term/frequency table built
# in the preceding cells is named `df`; no `fdist_df` is defined in this
# notebook, so referring to it raises NameError.
fdist_sorted = df.sort_values(by='freq', ascending=False)
fdist_sorted

In [None]:
# Keep only the terms that occur more than four times.
fdist_filtered = fdist_sorted[fdist_sorted['freq'] > 4]
fdist_filtered

In [None]:
# Horizontal bar chart: one bar per retained term, bar length = frequency.
plt.figure(figsize=(12,8))
sns.barplot(data=fdist_filtered, x='freq', y='term')
plt.title('Term frequency distribution')
plt.show()

### Generate a word cloud visualization for the same document for which you generated the frequency distribution.

In [None]:
# Build the word cloud from the raw text of `doc`, passing the wordcloud
# package's STOPWORDS as the stop-word set.
cloud = WordCloud(width=1200, height=900, stopwords=STOPWORDS).generate(doc)
plt.figure(figsize=(12,9))
plt.imshow(cloud)
# Hide the axes; they carry no meaning for an image.
plt.axis('off')
plt.show()

### Choose a different article (preferably one that references several named entities) and create a dispersion plot that shows the occurrence of those entities throughout the document.

In [None]:
doc = docs[4]
spacy_doc = nlp(doc)
# Entity labels to leave out of the plot: the numeric/date/currency labels plus
# locations, organisations, events, groups (NORP) and geopolitical entities.
remove = ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 
          'CARDINAL', 'LOC', 'ORG', 'EVENT', 'NORP', 'GPE']
ents = [ent.text for ent in spacy_doc.ents
           if ent.label_ not in remove]
# De-duplicate the entity strings; going through a set means the order of
# `terms` is not guaranteed to be the same from run to run.
terms = list(set(ents))

In [None]:
from pylab import rcParams
# Enlarge the default figure size for the plot drawn below.
rcParams['figure.figsize'] = 12, 10

tokenized = word_tokenize(doc)
# NOTE(review): the text is a list of single tokens while `terms` holds full
# entity strings; multi-word entities (e.g. a first and last name) presumably
# never equal a single token and would show no occurrences — confirm against
# the NLTK dispersion_plot behaviour.
Text(tokenized).dispersion_plot(terms)

### Choose another article and generate a POS visualization highlighting the parts of speech for tokens in the article.

In [4]:
from yellowbrick.text.postag import PosTagVisualizer



In [None]:
# Select the article used for the part-of-speech visualisation and display it.
doc = docs[9]
doc

In [None]:
# Tokenize the article and attach a part-of-speech tag to every token.
tokens = word_tokenize(doc)
tagged = pos_tag(tokens)

visualizer = PosTagVisualizer()
# NOTE(review): `transform` is given a flat list of (token, tag) pairs, and the
# print below reads `visualizer.tagged` and calls `visualizer.colorize`; this
# looks like an older Yellowbrick API — confirm against the installed version.
visualizer.transform(tagged)

# Each item of visualizer.tagged is unpacked as (color, token); the colorized
# tokens are joined back together with single spaces.
print(' '.join([visualizer.colorize(token, color) for color, token in visualizer.tagged]))