In [19]:
## IMPORTS
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import spacy
from nltk.tag import pos_tag
from nltk import CFG
from nltk.parse import ChartParser
import pandas as pd

In [20]:
# Fetch the NLTK resources this notebook asks for: the 'punkt' tokenizer models
# and the averaged-perceptron POS tagger.
# NOTE(review): word_tokenize (used by WordToVec below) presumably relies on 'punkt';
# newer NLTK releases may look for 'punkt_tab' instead - confirm against the
# installed NLTK version.
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [21]:
# Load ./clean_d_tweets.csv (path is relative to the current working directory)
# into a DataFrame.
dataframe = pd.read_csv('./clean_d_tweets.csv')

In [22]:
# The 'tweet' column of the dataset. It is read as a module-level global by
# NGramsLanguageModel.get_corpus and WordToVec.get_all_sentences_from_tweet,
# both of which guard against entries that are not strings.
tweets = dataframe['tweet']

# **Language Modelling Using an N-Gram Model**

In [23]:
## N Grams Language Model
class NGramsLanguageModel :
    """Word n-gram count model used to find the corpus entry closest to a query.

    The corpus is vectorised with a ``CountVectorizer`` restricted to word
    n-grams of exactly size ``n``; queries are compared against the fitted
    matrix with cosine similarity.

    Attributes:
        n: Size of the word n-grams.
        vectorizer: The ``CountVectorizer`` fitted on ``corpus``.
        corpus: List of documents (non-string entries replaced by '').
        matrix: Document/n-gram count matrix returned by ``fit_transform``.
    """

    def __init__ (self, n, corpus=None) :
        """Fit the model.

        Args:
            n: Size of the word n-grams (``ngram_range=(n, n)``).
            corpus: Optional iterable of documents. When omitted, the
                module-level ``tweets`` collection is used, as before.
        """
        self.n = n
        self.vectorizer = CountVectorizer(analyzer='word', ngram_range=(n, n))
        self.corpus = self.get_corpus(corpus)
        self.matrix = self.fit_transform(self.corpus)

    def get_corpus(self, documents=None) :
        """Return the documents as a list of strings.

        Entries that are not ``str`` (for example missing values) are replaced
        by an empty string so that positions stay aligned with the source.

        Args:
            documents: Optional iterable of documents; defaults to the
                module-level ``tweets``.
        """
        if documents is None :
            documents = tweets
        return [doc if isinstance(doc, str) else '' for doc in documents]

    def calculate_cosine_similarity(self, query_v) :
        """Return the cosine similarity of every corpus row against ``query_v``."""
        return cosine_similarity(self.matrix, query_v)

    def fit_transform(self, corpus) :
        """Fit the vectorizer on ``corpus`` and return its count matrix."""
        return self.vectorizer.fit_transform(corpus)

    def transform(self, corpus) :
        """Vectorise ``corpus`` with the already fitted vectorizer."""
        return self.vectorizer.transform(corpus)

    def find_most_similar_sentence(self, sentence) :
        """Return ``(best_sentence, best_score)`` for ``sentence``.

        The first corpus entry with the strictly highest similarity wins.
        When no entry has a similarity above zero the result is ``('', 0)``.
        """
        best_sentence = ''
        best_score = 0
        for candidate, row in zip(self.corpus, self.get_similarity(sentence)) :
            # Each row holds a single value: the similarity to the one query.
            if best_score < row[0] :
                best_score = row[0]
                best_sentence = candidate
        return best_sentence, best_score

    def get_most_similar_sentence(self, sentence) :
        """Print the corpus entry most similar to ``sentence`` and its score."""
        best_sentence, best_score = self.find_most_similar_sentence(sentence)
        print(f'Most similar sentence : {best_sentence}')
        print(f'Similarity : {best_score * 100}%')

    def get_similarity(self, sentence) :
        """Return the similarity of every corpus entry to ``sentence``."""
        query = self.transform([sentence])
        return self.calculate_cosine_similarity(query)

# **Word Embedding using Word2Vec**

In [24]:
# Word Embedding
class WordToVec :
    """Word2Vec embedding trained on the module-level ``tweets`` collection.

    Attributes:
        sentences: Tokenised, lower-cased tweets used for training.
        model: The ``Word2Vec`` model trained by ``make_model``.
        loaded_model: The same model re-loaded from ``file_name``.
        file_name: Path the trained model is saved to.
    """

    def __init__ (self) :
        """Tokenise the tweets and train, save and reload the model."""
        self.sentences = self.get_all_sentences_from_tweet()
        self.model = None
        self.loaded_model = None
        self.file_name = "word2vec_model.bin"
        self.make_model()

    def get_all_sentences_from_tweet(self) :
        """Return one lower-cased token list per tweet.

        Entries that are not strings (for example missing values) are skipped
        explicitly instead of relying on a swallowed ``AttributeError``.
        """
        return [
            word_tokenize(tweet.lower())
            for tweet in tweets
            if isinstance(tweet, str)
        ]


    def make_model(self) :
        """Train the model on ``self.sentences``, save it and load it back."""
        self.model = Word2Vec(sentences=self.sentences, vector_size=100, window=5, min_count=1, workers=4)
        self.model.save(self.file_name)
        self.loaded_model = Word2Vec.load(self.file_name)

    def get_n_most_common_words_compare_to_word(self, word, n=10) :
        """Print the ``n`` vocabulary words most similar to ``word``.

        The query is stripped and lower-cased first because the training
        tokens were lower-cased. A word that is not in the vocabulary is
        reported with a message instead of raising ``KeyError``.

        Args:
            word: Query word.
            n: Number of neighbours to print.
        """
        query = word.strip().lower()
        try :
            neighbours = self.loaded_model.wv.most_similar(query, topn=n)
        except KeyError :
            print(f"'{query}' is not in the Word2Vec vocabulary")
            return
        for neighbour, similarity in neighbours :
            print(f"Word       : {neighbour}")
            print(f"Similarity : {similarity * 100}%")

# **Named Entity Recognition Using spaCy**

In [25]:
# Class For Name Entity Recognition

class NameEntityRecognition() :
    """Extract selected named entities from a sentence with spaCy.

    Attributes:
        process: The spaCy pipeline (``en_core_web_sm``).
        document: The result of running the pipeline on the sentence.
        entities: Dict mapping a display name to the list of entity texts
            found for it, in the key order of ``ENTITY_LABELS``.
    """

    # Display name -> entity label reported by the pipeline. Entities with
    # any other label are ignored.
    ENTITY_LABELS = {
        "Person" : 'PERSON',
        "Location" : 'GPE',
        "Event" : 'EVENT',
        "Date" : 'DATE',
        "Time" : 'TIME',
        "Organization" : 'ORG',
        "Quantity" : 'QUANTITY',
        "Money" : 'MONEY',
    }

    # Pipeline shared by all instances so the model is loaded only once.
    _pipeline = None

    def __init__(self, sentence) :
        """Run the pipeline on ``sentence`` and collect its entities."""
        self.process = self._load_pipeline()
        self.document = self.process(sentence)
        self.get_sentence_entities()

    @classmethod
    def _load_pipeline(cls) :
        """Return the shared pipeline, loading it on first use."""
        if cls._pipeline is None :
            cls._pipeline = spacy.load("en_core_web_sm")
        return cls._pipeline

    def get_sentence_dependencies(self) :
        """Print text, head, dependency label and POS tag of every token."""
        for token in self.document:
            print('==========================================')
            print(f'Text              : {token.text}')
            print(f'Head              : {token.head}')
            print(f'Dependecy To Head : {token.dep_}')
            print(f'Position.         : {token.pos_}')

    def get_sentence_entities(self) :
        """Populate ``self.entities`` from ``self.document.ents`` in one pass."""
        display_name_for = {label : name for name, label in self.ENTITY_LABELS.items()}
        # Pre-create every key so empty categories are still listed.
        found = {name : [] for name in self.ENTITY_LABELS}
        for ent in self.document.ents :
            name = display_name_for.get(ent.label_)
            if name is not None :
                found[name].append(ent.text)
        self.entities = found

    def print_sentence_entities(self) :
        """Print every category followed by the entity texts found for it."""
        for entity, texts in self.entities.items() :
            print(f'Text for Entity -> {entity}')
            for text in texts :
                print("\t- " + text)





# **Dependency Parsing Using spaCy**

In [26]:
# Class For Dependency Parsing

class DependencyParsing:
    """Print the dependency structure of a sentence using spaCy.

    Attributes:
        sentence: The raw input sentence.
        processor: The spaCy pipeline (``en_core_web_sm``).
        document: The result of running the pipeline on ``sentence``.
    """

    # Pipeline shared by all instances so the model is loaded only once
    # instead of on every request.
    _pipeline = None

    def __init__ (self, sentence) :
        """Run the pipeline on ``sentence``."""
        self.sentence = sentence
        self.processor = self._load_pipeline()
        self.document = self.processor(self.sentence)

    @classmethod
    def _load_pipeline(cls) :
        """Return the shared pipeline, loading it on first use."""
        if cls._pipeline is None :
            cls._pipeline = spacy.load("en_core_web_sm")
        return cls._pipeline

    def get_sentence_dependencies(self) :
        """Print text, head, dependency label and POS tag of every token."""
        for token in self.document:
            print('==========================================')
            print(f'Text              : {token.text}')
            print(f'Head              : {token.head}')
            print(f'Dependecy To Head : {token.dep_}')
            print(f'Position.         : {token.pos_}')

# **Grammar Parsing**

In [33]:
class GrammarParsing :
    """Parse a sentence against a small hand-written context-free grammar.

    Attributes:
        example_sentence: Sentence parsed when none is supplied.
        grammar: The ``CFG`` built from the productions below.
    """

    def __init__ (self) :
        """Build the example sentence and the grammar."""
        self.example_sentence = 'The Doctor killed his mother yesterday'
        # NOTE(review): the PP production is not reachable from S, and the
        # second VP alternative uses 'V N P NP' - confirm whether
        # 'V NP PP' was intended.
        self.grammar = CFG.fromstring('''
            S -> NP VP
            NP -> Det N
            VP -> V NP T | V N P NP
            N -> 'Doctor' | 'mother'
            Det -> 'The' | 'the' | 'his'
            V -> 'killed'
            PP -> P N
            P -> 'with' | 'into'
            T -> 'yesterday' | 'today'
        ''')

    def grammar_parsing(self, sentence=None) :
        """Print every parse tree of a sentence.

        Args:
            sentence: Whitespace-separated sentence to parse. Defaults to
                ``self.example_sentence``.

        Prints "no parses found" when the grammar yields no tree, and an
        error message when the parser rejects the input (the chart parser
        signals words the grammar does not cover with ``ValueError``).
        """
        text = self.example_sentence if sentence is None else sentence
        words = text.split()
        parser = ChartParser(self.grammar)

        try :
            # Materialise once; the trees are reused for printing below.
            parses = list(parser.parse(words))
        except ValueError as e :
            print("Error during parsing : ", e)
            return

        if not parses :
            print("no parses found")
            return

        for tree in parses :
            print(tree, "\n")
            tree.pretty_print()

In [45]:

# Build the models once, before the menu loop. Constructing them fits the
# n-gram vectorizer and trains Word2Vec on the whole tweet corpus, so doing
# it on every iteration made each menu round needlessly slow.
ngram = NGramsLanguageModel(2)
wtv = WordToVec()
gp = GrammarParsing()

# Interactive menu: read a choice, read a sentence where one is needed, run
# the matching demo. Any choice other than 1-5 ends the loop.
while True :
    print("")
    print("NOTES -> language model for n gram and word 2 vec is about depressions tweet in twitter")
    print("1. n gram model")
    print("2. word 2 vec")
    print("3. ner")
    print("4. dependency parsing")
    print("5. grammar parsing")
    # Tolerate stray whitespace around the menu number.
    choice = input().strip()


    if choice == '1' :
        sentence = input()
        ngram.get_most_similar_sentence(sentence)
    elif choice == '2' :
        sentence = input()
        wtv.get_n_most_common_words_compare_to_word(sentence)
    elif choice == '3' :
        sentence = input()
        ner = NameEntityRecognition(sentence)
        ner.print_sentence_entities()
    elif choice == '4' :
        sentence = input()
        dp = DependencyParsing(sentence)
        dp.get_sentence_dependencies()
    elif choice == '5' :
        gp.grammar_parsing()
    else :
        break


NOTES -> language model for n gram and word 2 vec is about depressions tweet in twitter
1. n gram model
2. word 2 vec
3. ner
4. dependency parsing
5. grammar parsing
3
I live in Indonesia where i usually kill someone please Elon Musk SpaceX Apple
Text for Entity -> Person
	- Elon Musk
Text for Entity -> Location
	- Indonesia
Text for Entity -> Event
Text for Entity -> Date
Text for Entity -> Time
Text for Entity -> Organization
	- Apple
Text for Entity -> Quantity
Text for Entity -> Money

NOTES -> language model for n gram and word 2 vec is about depressions tweet in twitter
1. n gram model
2. word 2 vec
3. ner
4. dependency parsing
5. grammar parsing
5
(S
  (NP (Det The) (N Doctor))
  (VP (V killed) (NP (Det his) (N mother)) (T yesterday))) 

                 S                            
      ___________|_____                        
     |                 VP                     
     |            _____|_________________      
     NP          |         NP            |    
  ___|_

KeyboardInterrupt: Interrupted by user