In [1]:
!pip install textblob



In [2]:
import spacy
from spacy.matcher import Matcher
from textblob import TextBlob

# Load the transformer-based model
nlp = spacy.load("en_core_web_trf")

# Define the headlines
headlines = [
    "Tomato rates soar as Centre looks to south for new stock",
    "Indian stock market: 6 key things that changed for market overnight - Gift Nifty, US Treasury yields to gold rate"
]

# Define categories and keywords with synonyms
positive_keywords = ["approves", "investment", "expansion", "success", "high", "best ever", "fly high", "jump", "rallied", "surge", "increase", "rise", "gain", "soar", "profit"]
negative_keywords = ["loser", "fall", "decline", "drop", "decrease", "plunge", "slump", "crash", "fraud"]
sectorial_keywords = ["energy stocks", "aviation", "renewable energy"]
recommendation_keywords = ["planning to buy", "brokerages views", "recommendations"]
deal_keywords = ["raising", "investment target", "IPO", "stake", "valuation"]

# Initialize the matcher
matcher = Matcher(nlp.vocab)

# Define patterns for specific phrases including entities
patterns = [
    [{"ENT_TYPE": "PERSON"}, {"LOWER": "of"}, {"ENT_TYPE": "ORG"}, {"LOWER": "top"}, {"LOWER": "picks"}],
    [{"ENT_TYPE": "PERSON"}, {"LOWER": "suggested"}, {"LOWER": "this"}],
    [{"LOWER": "breakout"}, {"LOWER": "stock"}],
    [{"LOWER": "price"}, {"LOWER": "volume"}, {"LOWER": "breakouts"}],
    [{"ENT_TYPE": "PERSON"}, {"LOWER": "recommends"}, {"LOWER": "this"}]
]

# Add patterns to matcher
for pattern in patterns:
    matcher.add("TOP_PICK_PATTERNS", [pattern])

# Function to classify headline
def classify_headline(headline):
    doc = nlp(headline)
    matches = matcher(doc)
    
    if matches:
        return "Top Picks"
    
    category = None
    sentiment = TextBlob(headline).sentiment.polarity

    # Dependency parsing and context check
    for token in doc:
        if token.lemma_ in positive_keywords and any([ent.label_ == "ORG" for ent in doc.ents]):
            return "Positive News"
        elif token.lemma_ in negative_keywords and any([ent.label_ == "ORG" for ent in doc.ents]):
            return "Negative News"
        elif token.lemma_ in sectorial_keywords:
            return "Sectorial News"
        elif token.lemma_ in recommendation_keywords:
            return "Stock Recommendations"
        elif token.lemma_ in deal_keywords:
            return "Deals"
    
    # Use sentiment analysis to determine positive or negative context
    if sentiment > 0:
        return "Positive News"
    elif sentiment < 0:
        return "Negative News"
    
    return category

# Process headlines and classify
alerts = []
for headline in headlines:
    category = classify_headline(headline)
    if category:
        alerts.append((headline, category))

# Print alerts
for alert in alerts:
    print(f"Category: {alert[1]}\nHeadline: {alert[0]}\n")


Category: Positive News
Headline: Tomato rates soar as Centre looks to south for new stock



In [3]:
import spacy
from spacy.matcher import Matcher
from textblob import TextBlob
from transformers import pipeline
import torch

# Load the transformer-based model for spaCy
nlp = spacy.load("en_core_web_trf")

# Load the sentiment analysis pipeline from transformers
sentiment_pipeline = pipeline("sentiment-analysis")

# Define the headlines
headlines = [
    "Tomato rates soar as Centre looks to south for new stock",
    "Indian stock market: 6 key things that changed for market overnight - Gift Nifty, US Treasury yields to gold rate"
]

# Define categories and keywords with synonyms
positive_keywords = ["approves", "investment", "expansion", "success", "high", "best ever", "fly high", "jump", "rallied", "surge", "increase", "rise", "gain", "soar", "profit"]
negative_keywords = ["loser", "fall", "decline", "drop", "decrease", "plunge", "slump", "crash", "fraud"]
sectorial_keywords = ["energy stocks", "aviation", "renewable energy"]
recommendation_keywords = ["planning to buy", "brokerages views", "recommendations"]
deal_keywords = ["raising", "investment target", "IPO", "stake", "valuation"]

# Initialize the matcher
matcher = Matcher(nlp.vocab)

# Define patterns for specific phrases including entities
patterns = [
    [{"ENT_TYPE": "PERSON"}, {"LOWER": "of"}, {"ENT_TYPE": "ORG"}, {"LOWER": "top"}, {"LOWER": "picks"}],
    [{"ENT_TYPE": "PERSON"}, {"LOWER": "suggested"}, {"LOWER": "this"}],
    [{"LOWER": "breakout"}, {"LOWER": "stock"}],
    [{"LOWER": "price"}, {"LOWER": "volume"}, {"LOWER": "breakouts"}],
    [{"ENT_TYPE": "PERSON"}, {"LOWER": "recommends"}, {"LOWER": "this"}]
]

# Add patterns to matcher
for pattern in patterns:
    matcher.add("TOP_PICK_PATTERNS", [pattern])

# Function to perform sentiment analysis using transformers
def transformer_sentiment_analysis(headline):
    result = sentiment_pipeline(headline)[0]
    label = result['label']
    score = result['score']
    return label, score

# Function to classify headline
def classify_headline(headline):
    doc = nlp(headline)
    matches = matcher(doc)

    if matches:
        return "Top Picks"
    
    category = None
    # Sentiment analysis
    label, score = transformer_sentiment_analysis(headline)
    
    if label == "POSITIVE":
        for token in doc:
            if token.lemma_ in positive_keywords and any([ent.label_ == "ORG" for ent in doc.ents]):
                return "Positive News"
    elif label == "NEGATIVE":
        for token in doc:
            if token.lemma_ in negative_keywords and any([ent.label_ == "ORG" for ent in doc.ents]):
                return "Negative News"

    # Check for sectorial, recommendation, and deal keywords
    for token in doc:
        if token.lemma_ in sectorial_keywords:
            return "Sectorial News"
        elif token.lemma_ in recommendation_keywords:
            return "Stock Recommendations"
        elif token.lemma_ in deal_keywords:
            return "Deals"
    
    # Default to sentiment analysis result if no specific category matches
    if label == "POSITIVE":
        return "Positive News"
    elif label == "NEGATIVE":
        return "Negative News"
    
    return category

# Process headlines and classify
alerts = []
for headline in headlines:
    category = classify_headline(headline)
    if category:
        alerts.append((headline, category))

# Print alerts
for alert in alerts:
    print(f"Category: {alert[1]}\nHeadline: {alert[0]}\n")


No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Category: Negative News
Headline: Tomato rates soar as Centre looks to south for new stock

Category: Positive News
Headline: Indian stock market: 6 key things that changed for market overnight - Gift Nifty, US Treasury yields to gold rate



In [3]:
import pandas as pd
df = pd.read_csv("Data_for_semantic_analysis.csv",index_col = 0)
df.head()

Unnamed: 0,headline,news_category,org
0,'Will explore legal options': Kotak Bank probi...,Company Business News,Mint
1,ONGC Leads Nifty Gainers With 3% Gain As Marke...,,Benzinga India
2,SBI wants tax parity on bank FDs with mutual f...,,The Economic Times
3,"Rs 85,000 Crore Order Book: Railway Infra Comp...",,Dalal Street Investment Journal
4,Trade Setup For July 9: Investors Await Q1 Ear...,,NDTV Profit


In [5]:
alerts = []
for headline in df.headline:
    category = classify_headline(headline)
    if category:
        alerts.append((headline, category))

# Print alerts
for alert in alerts:
    print(f"Category: {alert[1]}\nHeadline: {alert[0]}\n")

Category: Positive News
Headline: 'Will explore legal options': Kotak Bank probing if Kingdon Capital hid Hindenburg Research links 

Category: Negative News
Headline: ONGC Leads Nifty Gainers With 3% Gain As Market Ends Flat On Monday 

Category: Negative News
Headline: SBI wants tax parity on bank FDs with mutual funds & equity markets in Budget 

Category: Negative News
Headline: Rs 85,000 Crore Order Book: Railway Infra Company Hits Fresh 52-Week High with Heavy Volumes; Stock Gains 370 per cent in Just 1 Year 

Category: Negative News
Headline: Trade Setup For July 9: Investors Await Q1 Earnings As Nifty Faces Key Resistance Levels 

Category: Negative News
Headline: Ola replaces Google Maps with in-house ‘Ola Maps’, announces API for developers 

Category: Negative News
Headline: Tata Motors Q1 update: Global wholesales increase 2% YoY; stock jumps 

Category: Positive News
Headline: Kia India Introduces New Variants For Seltos, Sonet 

Category: Negative News
Headline: Liver Doc

In [6]:
import textblob

In [7]:
textblob.Word("Indian stock market: 6 key things that changed for market overnight - Gift Nifty, US Treasury yields to gold rate")

'Indian stock market: 6 key things that changed for market overnight - Gift Nifty, US Treasury yields to gold rate'

In [8]:
import spacy

In [11]:
import nltk

In [1]:
from nltk.corpus import brown
brown.categories()
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies',
'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance',
'science_fiction']
brown.words(categories='news')

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [3]:
import nltk

In [5]:
nltk.corpus.brown.words(categories = "news")

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [4]:
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('stocks')


af in is all health more now knowledge dark towns the said of that and
for possible this two date


In [6]:
import nltk
from nltk.corpus import brown

nltk.download('brown')


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\modza\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [7]:
news_texts = brown.words(categories='news')

In [8]:
print(news_texts[:100])


['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]


In [9]:
news_text = ' '.join(news_texts)
print(news_text[:500])


The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place . The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the City of Atlanta '' for the manner in which the election was conducted . The September-October term jury had been charged by Fulton Superior Court Judge Durwood Pye to invest


In [10]:
news_sentences = brown.sents(categories='news')
for sentence in news_sentences[:5]:
    print(' '.join(sentence))


The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .
The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the City of Atlanta '' for the manner in which the election was conducted .
The September-October term jury had been charged by Fulton Superior Court Judge Durwood Pye to investigate reports of possible `` irregularities '' in the hard-fought primary which was won by Mayor-nominate Ivan Allen Jr. .
`` Only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in the election , the number of voters and the size of this city '' .
The jury said it did find that many of Georgia's registration and election laws `` are outmoded or inadequate and often ambiguous '' .


In [11]:
import nltk
from nltk.corpus import brown

# Download the Brown corpus
nltk.download('brown')

# Access the news category
news_texts = brown.words(categories='news')

# Print the first 100 words
print(news_texts[:100])

# Convert to a string and print the first 500 characters
news_text = ' '.join(news_texts)
print(news_text[:500])

# Get sentences instead of words
news_sentences = brown.sents(categories='news')
for sentence in news_sentences[:5]:
    print(' '.join(sentence))

# Tokenization
from nltk.tokenize import word_tokenize
news_words = word_tokenize(news_text)

# Frequency Distribution
from nltk.probability import FreqDist
fdist = FreqDist(news_words)
print(fdist.most_common(10))


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\modza\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place . The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the City of Atlanta '' for the manner in which the election was conducted . The September-October term jury had been charged by Fulton Superior Court Judge Durwood Pye to invest
The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .
The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the City of Atlanta '' for the manner in which the election was conducted .
The September-October term jury had been c

text = nltk.word_tokenize("Indian stock market: 6 key things that changed for market overnight - Gift Nifty, US Treasury yields to gold rate")
nltk.pos_tag(text)

In [15]:
import pandas as pd

In [16]:
import pandas as pd
df = pd.read_csv("Data_for_semantic_analysis.csv",index_col = 0)
df.head()

Unnamed: 0,headline,news_category,org
0,'Will explore legal options': Kotak Bank probi...,Company Business News,Mint
1,ONGC Leads Nifty Gainers With 3% Gain As Marke...,,Benzinga India
2,SBI wants tax parity on bank FDs with mutual f...,,The Economic Times
3,"Rs 85,000 Crore Order Book: Railway Infra Comp...",,Dalal Street Investment Journal
4,Trade Setup For July 9: Investors Await Q1 Ear...,,NDTV Profit


In [17]:
df.headline[0]

"'Will explore legal options': Kotak Bank probing if Kingdon Capital hid Hindenburg Research links "

In [18]:
import nltk
from nltk.corpus import brown
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('brown')

# Sample headlines


# Function to preprocess text
def preprocess(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans(, , string.punctuation))
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return tokens

# Preprocess headlines
preprocessed_headlines = [preprocess(headline) for headline in df.headline]
print(preprocessed_headlines)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\modza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\modza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\modza\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!




In [19]:
from nltk.probability import FreqDist

# Access the news category from the Brown corpus
news_words = brown.words(categories='news')

# Preprocess Brown news words
preprocessed_news_words = [preprocess(word) for word in news_words]

# Flatten the list of lists
preprocessed_news_words = [item for sublist in preprocessed_news_words for item in sublist]

# Calculate frequency distribution for Brown news and your headlines
news_fdist = FreqDist(preprocessed_news_words)
headline_fdist = FreqDist([word for headline in preprocessed_headlines for word in df.headline])

# Compare the most common words
print("Most common words in Brown news corpus:", news_fdist.most_common(10))
print("Most common words in your headlines:", headline_fdist.most_common(10))


Most common words in Brown news corpus: [('said', 406), ('mrs', 254), ('would', 246), ('new', 241), ('one', 213), ('last', 177), ('two', 174), ('mr', 170), ('first', 158), ('state', 153)]
Most common words in your headlines: [('SJVN share price target 2024: Rs 1 lakh became Rs 3 lakh in 1 yr! Time to BUY PSU stock for more gains? ', 2158), ("Ratan Tata's company earns Rs 38000 crore in 5 days, becomes biggest... ", 2158), ('MCX technical glitch: Commodity trading begins at 10 AM after one hour delay ', 2158), ('Wheel falls off United Airlines Boeing moments after takeoff from LA airport: Watch ', 2158), ('Stocks to buy or sell: RVNL, RCF to Alembic — Sumeet Bagadia recommends five breakout stocks today ', 2158), ('BlackRock buys 4.55 mn shares of Swan Energy worth Rs 304 cr via block deal ', 2158), ('Zerodha glitch: Man explains how he lost Rs 10 lakh, seeks help ', 2158), ('Ducati Hypermotard 698 Mono Launch Price Rs 16.5 Lakh - RVE Arrival Later ', 2158), ('Trade setup for today: Top

In [20]:
from nltk import NaiveBayesClassifier, classify
from nltk.classify import apply_features

# Create labeled feature sets for training
def headline_features(words):
    return {word: True for word in words}

# Create labeled data from Brown corpus
brown_news = [(headline_features(brown.words(fileid)), 'news') for fileid in brown.fileids(categories='news')]

# Split into training and test sets
train_set = apply_features(headline_features, brown_news[:400])
test_set = apply_features(headline_features, brown_news[400:])

# Train Naive Bayes classifier
classifier = NaiveBayesClassifier.train(train_set)

# Test classifier
accuracy = classify.accuracy(classifier, test_set)
print("Classifier accuracy:", accuracy)

# Classify new headlines
for headline in preprocessed_headlines:
    print(f"Headline: {' '.join(headline)}")
    print("Category:", classifier.classify(headline_features(headline)))


Classifier accuracy: 0
Headline: explore legal options kotak bank probing kingdon capital hid hindenburg research links
Category: news
Headline: ongc leads nifty gainers 3 gain market ends flat monday
Category: news
Headline: sbi wants tax parity bank fds mutual funds equity markets budget
Category: news
Headline: rs 85000 crore order book railway infra company hits fresh 52week high heavy volumes stock gains 370 per cent 1 year
Category: news
Headline: trade setup july 9 investors await q1 earnings nifty faces key resistance levels
Category: news
Headline: ola replaces google maps inhouse ‘ ola maps ’ announces api developers
Category: news
Headline: tata motors q1 update global wholesales increase 2 yoy stock jumps
Category: news
Headline: kia india introduces new variants seltos sonet
Category: news
Headline: liver doc slams nikhil kamath glorifying business alcohol podcast crime
Category: news
Headline: lt enhances renewables portfolio finalises mega orders solar plants middle east

In [22]:
from nltk.corpus import wordnet as wn
wn.synsets('motorcar')
#[Synset('car.n.01')]

[Synset('car.n.01')]

In [23]:
wn.synset('car.n.01').lemma_names()

['car', 'auto', 'automobile', 'machine', 'motorcar']

In [25]:
for i in wn.synsets('stocks'):
    print(wn.synset(i).lemma_names())

AttributeError: 'Synset' object has no attribute 'lower'

In [33]:
wn.synsets('shares')

[Synset('share.n.01'),
 Synset('share.n.02'),
 Synset('parcel.n.02'),
 Synset('contribution.n.01'),
 Synset('plowshare.n.01'),
 Synset('share.v.01'),
 Synset('share.v.02'),
 Synset('partake.v.02'),
 Synset('share.v.04'),
 Synset('share.v.05')]

In [36]:
wn.synset('contribution.n.01').lemma_names()

['contribution', 'part', 'share']

In [56]:
wn.synset('stock_certificate.n.01').definition()
#'a motor vehicle with four wheels; usually propelled by an internal combustion engine'
wn.synset('stock_certificate.n.01').examples()

['the value of his stocks doubled during the past year']

In [42]:
a = wn.synsets('stock')

In [53]:
wn.synset('stock.n.02').lemma_names()

['stock', 'inventory']

In [55]:
wn.synset('stock_certificate.n.01').lemma_names()

['stock_certificate', 'stock']

In [61]:
from nltk.corpus import wordnet as wn
for ss in wn.synsets('stock'):
   print(ss, ss.definition())

Synset('stock.n.01') the capital raised by a corporation through the issue of shares entitling holders to an ownership interest (equity)
Synset('stock.n.02') the merchandise that a shop has on hand
Synset('stock.n.03') the handle of a handgun or the butt end of a rifle or shotgun or part of the support of a machine gun or artillery gun
Synset('stock_certificate.n.01') a certificate documenting the shareholder's ownership in the corporation
Synset('store.n.02') a supply of something available for future use
Synset('lineage.n.01') the descendants of one individual
Synset('breed.n.01') a special variety of domesticated animals within a species
Synset('broth.n.01') liquid in which meat and vegetables are simmered; used as a basis for e.g. soups or sauces
Synset('stock.n.09') the reputation and popularity a person has
Synset('stock.n.10') persistent thickened stem of a herbaceous perennial plant
Synset('stock.n.11') a plant or stem onto which a graft is made; especially a plant grown specif

In [64]:
print(lesk(sent, 'bank', 'n'))
#Synset('savings_bank.n.02')

Synset('savings_bank.n.02')


In [62]:
from nltk.wsd import lesk
sent = ['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.']

In [None]:
from nltk.wsd import lesk
sent = ['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.']

In [65]:
sent = [
    "Tomato rates soar as Centre looks to south for new stock",]
    #"Indian stock market: 6 key things that changed for market overnight - Gift Nifty, US Treasury yields to gold rate"
#]

In [66]:
print(lesk(sent, 'stock', 'n'))

Synset('store.n.02')


In [67]:
wn.synset('store.n.02').definition()

'a supply of something available for future use'

In [71]:
sent = [
    "Tomato rates soar as Centre looks to south for new stock",]
    "Indian stock market: 6 key things that changed for market overnight - Gift Nifty, US Treasury yields to gold rate"]
#]

In [76]:
lesk('John loves Mary'.split(), 'loves', synsets=[])

In [77]:
import nltk
from nltk.tokenize import word_tokenize

# List of sentences
sentences = [
    "Tomato rates soar as Centre looks to south for new stock",
    "Indian stock market: 6 key things that changed for market overnight - Gift Nifty, US Treasury yields to gold rate"
]

# Perform POS tagging
for sentence in sentences:
    # Tokenize the sentence
    tokens = word_tokenize(sentence)
    # Perform POS tagging
    pos_tags = nltk.pos_tag(tokens)
    # Print the results
    print(f"Sentence: {sentence}")
    print("POS Tags:", pos_tags)
    print()


Sentence: Tomato rates soar as Centre looks to south for new stock
POS Tags: [('Tomato', 'NNP'), ('rates', 'NNS'), ('soar', 'VBP'), ('as', 'IN'), ('Centre', 'NNP'), ('looks', 'VBZ'), ('to', 'TO'), ('south', 'VB'), ('for', 'IN'), ('new', 'JJ'), ('stock', 'NN')]

Sentence: Indian stock market: 6 key things that changed for market overnight - Gift Nifty, US Treasury yields to gold rate
POS Tags: [('Indian', 'JJ'), ('stock', 'NN'), ('market', 'NN'), (':', ':'), ('6', 'CD'), ('key', 'JJ'), ('things', 'NNS'), ('that', 'WDT'), ('changed', 'VBD'), ('for', 'IN'), ('market', 'NN'), ('overnight', 'JJ'), ('-', ':'), ('Gift', 'NN'), ('Nifty', 'NNP'), (',', ','), ('US', 'NNP'), ('Treasury', 'NNP'), ('yields', 'NNS'), ('to', 'TO'), ('gold', 'VB'), ('rate', 'NN')]



In [72]:
print(lesk(sent, 'stock', 'n'))

Synset('store.n.02')


In [73]:
wn.synset('store.n.02').definition()

'a supply of something available for future use'

In [78]:
import nltk
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
fourgram_measures = nltk.collocations.QuadgramAssocMeasures()
finder = BigramCollocationFinder.from_words(
 nltk.corpus.genesis.words('english-web.txt'))
finder.nbest(bigram_measures.pmi, 10)


[('Allon', 'Bacuth'),
 ('Ashteroth', 'Karnaim'),
 ('Ben', 'Ammi'),
 ('En', 'Mishpat'),
 ('Jegar', 'Sahadutha'),
 ('Salt', 'Sea'),
 ('Whoever', 'sheds'),
 ('appoint', 'overseers'),
 ('aromatic', 'resin'),
 ('cutting', 'instrument')]

In [79]:
#text = "I do not like green eggs and ham, I do not like them Sam I am!"
for i in df.headline[:5]:
    tokens = nltk.wordpunct_tokenize(i)
    finder = BigramCollocationFinder.from_words(tokens)
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    print(sorted(bigram for bigram, score in scored))

[("'", 'Will'), ("':", 'Kotak'), ('Bank', 'probing'), ('Capital', 'hid'), ('Hindenburg', 'Research'), ('Kingdon', 'Capital'), ('Kotak', 'Bank'), ('Research', 'links'), ('Will', 'explore'), ('explore', 'legal'), ('hid', 'Hindenburg'), ('if', 'Kingdon'), ('legal', 'options'), ('options', "':"), ('probing', 'if')]
[('%', 'Gain'), ('3', '%'), ('As', 'Market'), ('Ends', 'Flat'), ('Flat', 'On'), ('Gain', 'As'), ('Gainers', 'With'), ('Leads', 'Nifty'), ('Market', 'Ends'), ('Nifty', 'Gainers'), ('ONGC', 'Leads'), ('On', 'Monday'), ('With', '3')]
[('&', 'equity'), ('FDs', 'with'), ('SBI', 'wants'), ('bank', 'FDs'), ('equity', 'markets'), ('funds', '&'), ('in', 'Budget'), ('markets', 'in'), ('mutual', 'funds'), ('on', 'bank'), ('parity', 'on'), ('tax', 'parity'), ('wants', 'tax'), ('with', 'mutual')]
[(',', '000'), ('-', 'Week'), ('000', 'Crore'), ('1', 'Year'), ('370', 'per'), ('52', '-'), ('85', ','), (':', 'Railway'), (';', 'Stock'), ('Book', ':'), ('Company', 'Hits'), ('Crore', 'Order'), ('F

In [81]:
nltk.corpus.r

ImportError: cannot import name 'news' from 'nltk.corpus' (C:\Users\modza\Documents\nlp\env\Lib\site-packages\nltk\corpus\__init__.py)

In [80]:
from nltk.corpus import brown, movie_reviews, reuters
print(brown.categories())
print(movie_reviews.categories())

print(reuters.categories())


['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
['neg', 'pos']
['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 's

In [1]:
import textblob

In [2]:
t = textblob.sentiments.PatternAnalyzer()

In [84]:
t.analyze("Indian stock market: 6 key things that changed for market overnight - Gift Nifty, US Treasury yields to gold rate")

Sentiment(polarity=0.0, subjectivity=1.0)

In [85]:
from nltk.corpus import brown

In [None]:
brown.sents(

In [86]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [None]:
import nltk
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
fourgram_measures = nltk.collocations.QuadgramAssocMeasures()
finder = BigramCollocationFinder.from_words(
 nltk.corpus.genesis.words('english-web.txt'))
finder.nbest(bigram_measures.pmi, 10)


In [None]:
[('Allon', 'Bacuth'), ('Ashteroth', 'Karnaim'), ('Ben', 'Ammi'),
('En', 'Mishpat'), ('Jegar', 'Sahadutha'), ('Salt', 'Sea'),
('Whoever', 'sheds'), ('appoint', 'overseers'), ('aromatic', 'resin'),
('cutting', 'instrument')]

In [88]:
!git clone https://github.com/zafar0171/my_Realm


Cloning into 'my_Realm'...


In [89]:
!dir

 Volume in drive C has no label.
 Volume Serial Number is 8A9F-EF4E

 Directory of C:\Users\modza\Documents\nlp

21-07-2024  00:30    <DIR>          .
21-07-2024  00:30    <DIR>          ..
19-07-2024  19:35    <DIR>          .ipynb_checkpoints
18-01-2024  18:15           206,295 1. Text Classification - SMS.ipynb
30-01-2024  01:14            82,191 adobe.jpg
15-07-2024  16:07               123 corenlp_server-099517907f7a49a2.props
15-07-2024  13:35               123 corenlp_server-49648ec25afe4f13.props
15-07-2024  13:35               179 corenlp_server-d0c3292baa644e50.props
28-01-2024  02:07             2,063 dataset_infos.json
19-07-2024  19:34           115,932 Data_for_semantic_analysis.csv
18-01-2024  18:15           179,869 DocumentSimilarityClustering_v1.ipynb
17-01-2024  16:36    <DIR>          env
29-01-2024  01:32            12,489 extracted_text.csv
23-01-2024  12:39            62,533 Face_recog.ipynb
23-01-2024  12:47    <DIR>          flagged
17-07-2024  15:01           