### Tutorial from Earthly - Logistic Regression

In [30]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Define the function to clean the news title column
def cleaned_desc_column(text):
  # Remove commas
  text = re.sub(r',', '', text)
  # Remove extra spaces
  text = re.sub(r'\s+', ' ', text)
  # Remove full stops
  text = re.sub(r'\.', '', text)
  # Remove single quotes and double quotes
  text = re.sub(r"['\"]", '', text)
  # Remove other non-word characters
  text = re.sub(r'\W', ' ', text)

  text_token = word_tokenize(text)
  stop_words = set(stopwords.words('english'))

  filtered_text = []

  for sw in text_token:
    if sw not in stop_words:
        filtered_text.append(sw)

  text = " ".join(filtered_text)
  return text

[nltk_data] Downloading package punkt_tab to /Users/yty/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/yty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
import pandas as pd
df = pd.read_csv('data/balancednewcategory.csv')

In [32]:
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date,PESTEL_label
0,https://www.huffingtonpost.com/entry/mortgage-...,Mortgage Deal Reached In 2008 Shows Pitfalls T...,BUSINESS,"The Obama administration, which is pushing sta...",Loren Berlin,2012-02-05,Economic
1,https://www.huffingtonpost.com/entry/women-in-...,"Women in Business: Kate O'Brien Minson, Presid...",BUSINESS,Kate has lived and breathed the therapeutic ap...,"Laura Dunn, ContributorSocial Media and Commun...",2015-04-25,Economic
2,https://www.huffingtonpost.com/entry/like-athl...,"Like Athletes, Business Owners Need to Learn F...",BUSINESS,"Business owners and top executives can also ""w...","Mary Ellen Biery, ContributorResearch Speciali...",2015-01-19,Economic
3,https://www.huffingtonpost.com/entry/donald-tr...,Trump Could Trigger The Longest Recession Sinc...,BUSINESS,Yikes.,Ben Walsh,2016-06-27,Economic
4,https://www.huffingtonpost.com/entry/grocery-c...,Grocery Chains Made A Promise To The First Lad...,BUSINESS,An AP investigation found that major grocers o...,"Mike Schneider, AP",2015-12-07,Economic


In [39]:
# X = df['short_description']
df['combined'] = df['headline'] + " " + df['short_description']
X = df['combined']
y = df['PESTEL_label']

In [40]:
import sklearn
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size = 0.30, random_state = 90)
print(X_train.shape)
print(X_test.shape)

(11760,)
(5040,)


In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
# lr = Pipeline([('tfidf', TfidfVectorizer()),
#                ('clf', LogisticRegression(max_iter=1000)),])
lr = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2),
     min_df=3, max_df=0.9, stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced'))
])


# Train the logistic regression model on the training set
lr.fit(X_train, y_train)

In [48]:
# Make predictions on the test set
y_pred = lr.predict(X_test)

In [49]:
from sklearn.metrics import accuracy_score
# Calculate the accuracy of the model
print(f"Accuracy is: {accuracy_score(y_pred,y_test)}")

Accuracy is: 0.7549603174603174


In [51]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

     Economic       0.72      0.71      0.72       858
Environmental       0.77      0.76      0.77       809
        Legal       0.84      0.84      0.84       889
    Political       0.79      0.75      0.77       848
       Social       0.66      0.74      0.70       796
Technological       0.75      0.72      0.73       840

     accuracy                           0.75      5040
    macro avg       0.76      0.75      0.75      5040
 weighted avg       0.76      0.75      0.76      5040



In [52]:
news = ["Biden to Sign Executive Order That Aims to Make Child Care Cheaper",
       "Google Stock Loses $57 Billion Amid Microsoft's AI 'Lead'—And \
       Reports It Could Be Replaced By Bing On Some Smartphones",
       "Poland suspends food imports from Ukraine to assist its farmers",
       "Can AI Solve The Air Traffic Control Problem? Let's Find Out",
       "Woman From Odisha Runs 42.5 KM In UK Marathon Wearing A Saree",
       "Hillary Clinton: Trump cannot win the election - but Biden will",
       ]

predicted = lr.predict(news)

for doc, PESTEL_label in zip(news, predicted):
     print(PESTEL_label)

Political
Technological
Social
Technological
Social
Political
