### Tutorial from Earthly - Logistic Regression

In [4]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the tokenizer and stop-word resources that the cleaning function in
# this cell relies on. ('punkt' was the resource name tried here previously;
# 'punkt_tab' is the one in use.)
for _resource in ('punkt_tab', 'stopwords'):
    nltk.download(_resource)

# Define the function to clean a news text column (title / description)
def cleaned_desc_column(text, stop_words=None):
  """Strip punctuation and stop words from one piece of news text.

  Commas, full stops and quote characters are deleted outright (so "U.S."
  becomes "US"); every other non-word character is replaced by a space. The
  result is tokenized with ``word_tokenize`` and every token whose lower-case
  form is in the stop-word set is dropped.

  Args:
    text: The raw string to clean. Must be a ``str``; ``re.sub`` raises
      ``TypeError`` for anything else (for example a missing value).
    stop_words: Optional collection of lower-case words to drop. When omitted,
      NLTK's English stop-word list is loaded. Pass a prebuilt set when
      cleaning many rows so the list is not reloaded on every call.

  Returns:
    The surviving tokens, in their original case, joined by single spaces.
  """
  if stop_words is None:
    stop_words = set(stopwords.words('english'))

  # Delete commas, full stops and quotes without leaving a gap
  text = re.sub(r"[,.'\"]", '', text)
  # Any other non-word character becomes a space
  text = re.sub(r'\W', ' ', text)
  # Collapse the whitespace runs produced above
  text = re.sub(r'\s+', ' ', text)

  # Compare in lower case: the stop-word list is lower-case, so a
  # case-sensitive test lets capitalised stop words such as "The" through.
  kept_tokens = [
    token for token in word_tokenize(text)
    if token.lower() not in stop_words
  ]
  return " ".join(kept_tokens)

[nltk_data] Downloading package punkt_tab to /Users/yty/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/yty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
import pandas as pd
# Read the news articles into a DataFrame.
# (data/balancednewcategory.csv was the file loaded here previously.)
df = pd.read_csv(filepath_or_buffer="data/400perMNDS.csv")

In [6]:
# Preview the first five rows to check the columns that were loaded
df.head(n=5)

Unnamed: 0,data_id,id,date,source,title,content,author,url,published,published_utc,collection_utc,category_level_1,category_level_2,PESTEL_label
0,202399,fortune--2019-02-26--Amazon Adds Former PepsiC...,2019-02-26,fortune,Amazon Adds Former PepsiCo Chief Indra Nooyi t...,Amazon has added former PepsiCo CEO Indra Nooy...,Erik Sherman,http://fortune.com/2019/02/26/amazon-board-ind...,2019-02-26 14:40:17+00:00,1551210017,1567547264,"economy, business and finance",business information,Economic
1,668391,theduran--2019-09-23--Pakistans Economic Crisis,2019-09-23,theduran,Pakistan’s Economic Crisis,Pakistan is going through financial woes. The ...,Serban V.C. Enache,https://theduran.com/pakistans-economic-crisis/,2019-09-23 07:49:51+00:00,1569239391,1570222447,"economy, business and finance",economic sector,Economic
2,1064343,unian--2019-09-06--Reuters Oil rises set for w...,2019-09-06,unian,"Reuters: Oil rises, set for weekly gain amid h...","Brent crude was up 17 cents, or 0.3%, at $61.1...",,https://www.unian.info/economics/10675431-reut...,2019-09-06 06:00:00+00:00,1567764000,1569331139,"economy, business and finance",market and exchange,Economic
3,308315,mediamattersforamerica--2019-04-29--New NRA Pr...,2019-04-29,mediamattersforamerica,New NRA President Carolyn Meadows chairs the b...,"Carolyn Meadows, who is succeeding Oliver Nort...",Media Matters for America,https://www.mediamatters.org/blog/2019/04/29/n...,2019-04-29 21:18:38+00:00,1556587118,1567541775,"economy, business and finance",business information,Economic
4,494786,sottnet--2019-01-13--Saudi energy minister cla...,2019-01-13,sottnet,Saudi energy minister claims oil market on 'ri...,Saudi Arabia's top energy official expects the...,,https://www.sott.net/article/404819-Saudi-ener...,2019-01-13 16:03:45+00:00,1547413425,1567552737,"economy, business and finance",market and exchange,Economic


In [7]:
# Build the model input by joining each article's title and body text.
# Missing values are replaced with '' first: in pandas NaN + str is NaN, and
# a NaN document makes TfidfVectorizer raise when the pipeline is fitted.
# (An earlier variant of this cell used the 'headline' and
# 'short_description' columns instead.)
df['combined'] = df['title'].fillna('') + " " + df['content'].fillna('')
X = df['combined']
# Target: the PESTEL category assigned to each article
y = df['PESTEL_label']

In [8]:
import sklearn
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=90
)
# Show how many documents landed in each split
print(X_train.shape, X_test.shape, sep="\n")

(1680,)
(720,)


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
# Text-classification pipeline: TF-IDF features feeding a logistic regression.
# The vectorizer uses unigrams and bigrams, English stop-word removal, and the
# min_df / max_df cut-offs set below; the classifier uses balanced class
# weights. (The earlier baseline was a default TfidfVectorizer followed by
# LogisticRegression(max_iter=1000).)
lr = Pipeline(
    steps=[
        (
            'tfidf',
            TfidfVectorizer(
                stop_words='english',
                ngram_range=(1, 2),
                min_df=3,
                max_df=0.9,
            ),
        ),
        (
            'clf',
            LogisticRegression(class_weight='balanced', max_iter=1000),
        ),
    ]
)

# Fit the vectorizer and the classifier on the training split
lr.fit(X_train, y_train)

In [10]:
# Predict a PESTEL label for every held-out document
y_pred = lr.predict(X=X_test)

In [11]:
from sklearn.metrics import accuracy_score
# Fraction of test documents whose predicted label equals the true label.
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy happens
# to be symmetric, but keeping the order matches the other metric calls.
print(f"Accuracy is: {accuracy_score(y_test, y_pred)}")

Accuracy is: 0.8319444444444445


In [12]:
from sklearn.metrics import classification_report, confusion_matrix
# Per-class precision, recall and F1 on the held-out split
print(classification_report(y_true=y_test, y_pred=y_pred))

               precision    recall  f1-score   support

     Economic       0.86      0.83      0.84       115
Environmental       0.87      0.90      0.88       129
        Legal       0.89      0.89      0.89       129
    Political       0.74      0.77      0.75       111
       Social       0.87      0.80      0.84       130
Technological       0.75      0.78      0.77       106

     accuracy                           0.83       720
    macro avg       0.83      0.83      0.83       720
 weighted avg       0.83      0.83      0.83       720



In [13]:
# Unseen headlines used to spot-check the fitted pipeline
news = [
    "Biden to Sign Executive Order That Aims to Make Child Care Cheaper",
    # Adjacent string literals are concatenated by Python. A backslash
    # continuation inside the quotes would embed the next line's indentation
    # in the headline text.
    "Google Stock Loses $57 Billion Amid Microsoft's AI 'Lead'—And "
    "Reports It Could Be Replaced By Bing On Some Smartphones",
    "Poland suspends food imports from Ukraine to assist its farmers",
    "Can AI Solve The Air Traffic Control Problem? Let's Find Out",
    "Woman From Odisha Runs 42.5 KM In UK Marathon Wearing A Saree",
    "Hillary Clinton: Trump cannot win the election - but Biden will",
]

predicted = lr.predict(news)

# One predicted label per headline, in the same order as `news`
for label in predicted:
    print(label)

Social
Economic
Social
Technological
Social
Political
