In [1]:
# Step 1: Connect to Database and Fetch Data




In [2]:
import getpass
import os

import mysql.connector
import pandas as pd

# Never keep the database password in the notebook itself: take it from the
# MYSQL_PASSWORD environment variable, and fall back to an interactive prompt
# when that variable is not set.
db_password = os.environ.get("MYSQL_PASSWORD")
if db_password is None:
    db_password = getpass.getpass("MySQL password: ")

# Connect to MySQL Database
db_connection = mysql.connector.connect(
    host="127.0.0.1",
    user="root",
    password=db_password,
    database="supply_chain"
)

# The cursor and the connection are only needed while the rows are being read,
# so both are closed as soon as the fetch finishes (or fails). Anything that
# needs the database again afterwards has to open a new connection.
try:
    cursor = db_connection.cursor()
    try:
        # Query to select the articles data
        cursor.execute("SELECT title, description, published_at FROM articles")

        # Fetch all rows from the database
        rows = cursor.fetchall()
    finally:
        cursor.close()
finally:
    db_connection.close()

# Convert rows to a pandas DataFrame (the third column, published_at in SQL,
# is exposed as "publishedAt")
df = pd.DataFrame(rows, columns=["title", "description", "publishedAt"])

# Check the data
print(df.head())


                                               title  \
0  Windward (LON:WNWD) Stock Price Up 0.4% – Here...   
1  The Le Creuset Sale: When Retail Hype Becomes ...   
2  Atria Investments Inc Sells 2,215 Shares of SP...   
3  What is Cormark’s Forecast for Cresco Labs FY2...   
4  Bitcoin Exchange Reserves Hit 5-Year Low—What ...   

                                         description         publishedAt  
0  Windward Ltd. (LON:WNWD – Get Free Report) sho... 2024-11-16 08:44:56  
1  Shoppers queued for hours in freezing conditio... 2024-11-16 08:44:19  
2  Atria Investments Inc lowered its position in ... 2024-11-16 08:43:02  
3  Cresco Labs Inc. (OTCMKTS:CRLBF – Free Report)... 2024-11-16 08:40:55  
4  The Bitcoin market appears to have taken an in... 2024-11-16 08:30:40  


In [3]:
# Step 2: Clean the Text Data

In [4]:
import re

# Function to clean the text (remove special characters, lowercase, etc.)
def clean_text(text):
    """Normalise a piece of text for the later text-processing steps.

    The text is lowercased, anything that looks like a URL (``http`` followed
    by non-whitespace) is removed, every character that is not an ASCII
    letter, digit or whitespace is dropped, and leading/trailing whitespace
    is stripped. Whitespace inside the text is left untouched.

    Args:
        text: The raw text. Any non-string value (for example ``None`` or a
            float NaN standing in for a missing description) is treated as
            missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    # None / NaN have no .lower(); without this guard a single missing
    # description would abort the whole column-wide apply.
    if not isinstance(text, str):
        return ""

    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    return text.strip()  # Remove leading/trailing spaces

# Run every raw description through clean_text and keep the result in a new column
raw_descriptions = df['description']
df['cleaned_description'] = raw_descriptions.map(clean_text)

# Preview the first rows, now including the cleaned column
print(df.head())


                                               title  \
0  Windward (LON:WNWD) Stock Price Up 0.4% – Here...   
1  The Le Creuset Sale: When Retail Hype Becomes ...   
2  Atria Investments Inc Sells 2,215 Shares of SP...   
3  What is Cormark’s Forecast for Cresco Labs FY2...   
4  Bitcoin Exchange Reserves Hit 5-Year Low—What ...   

                                         description         publishedAt  \
0  Windward Ltd. (LON:WNWD – Get Free Report) sho... 2024-11-16 08:44:56   
1  Shoppers queued for hours in freezing conditio... 2024-11-16 08:44:19   
2  Atria Investments Inc lowered its position in ... 2024-11-16 08:43:02   
3  Cresco Labs Inc. (OTCMKTS:CRLBF – Free Report)... 2024-11-16 08:40:55   
4  The Bitcoin market appears to have taken an in... 2024-11-16 08:30:40   

                                 cleaned_description  
0  windward ltd lonwnwd  get free report shot up ...  
1  shoppers queued for hours in freezing conditio...  
2  atria investments inc lowered its posi

In [5]:
# Step 3: Apply Sentiment Analysis to Create Target Variable

In [6]:
from textblob import TextBlob

# Score a piece of text with TextBlob's sentiment analysis
def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text`` (range: -1 to 1)."""
    return TextBlob(text).sentiment.polarity

# Polarity score for each cleaned description
df['sentiment'] = df['cleaned_description'].map(get_sentiment)

# Synthetic binary target: 1 ("high demand") when the polarity is strictly
# positive, 0 otherwise (zero and negative polarity both map to 0)
df['demand'] = df['sentiment'].map(lambda polarity: 1 if polarity > 0 else 0)

# Preview the text next to its score and the derived label
preview_columns = ['cleaned_description', 'sentiment', 'demand']
print(df[preview_columns].head())



                                 cleaned_description  sentiment  demand
0  windward ltd lonwnwd  get free report shot up ...   0.102500       1
1  shoppers queued for hours in freezing conditio...  -0.063333       0
2  atria investments inc lowered its position in ...   0.225000       1
3  cresco labs inc otcmktscrlbf  free report  equ...   0.400000       1
4  the bitcoin market appears to have taken an in...   0.400000       1


In [8]:
# Step 4: TF-IDF Vectorization

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer limited to the top 1000 features, with English stop words removed
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

# Fit on the cleaned descriptions and turn them into TF-IDF features.
# NOTE(review): this fit uses every row, before the train/test split done
# later in the notebook, so the rows that end up in the test set also shape
# the vocabulary and IDF weights - consider fitting on the training rows only.
X_tfidf = vectorizer.fit_transform(df['cleaned_description'])

# Dense DataFrame view of the matrix, one column per feature name, for inspection
feature_names = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=feature_names)
print("TF-IDF Matrix (First few rows):")
print(tfidf_df.head())


TF-IDF Matrix (First few rows):
   003  039        04   10  100  1069  107   11  113  11383  ...  worldwide  \
0  0.0  0.0  0.202172  0.0  0.0   0.0  0.0  0.0  0.0    0.0  ...        0.0   
1  0.0  0.0  0.000000  0.0  0.0   0.0  0.0  0.0  0.0    0.0  ...        0.0   
2  0.0  0.0  0.000000  0.0  0.0   0.0  0.0  0.0  0.0    0.0  ...        0.0   
3  0.0  0.0  0.000000  0.0  0.0   0.0  0.0  0.0  0.0    0.0  ...        0.0   
4  0.0  0.0  0.000000  0.0  0.0   0.0  0.0  0.0  0.0    0.0  ...        0.0   

   worst  worth  writi  writings   xi  year  years  ymbon  york  
0    0.0    0.0    0.0       0.0  0.0   0.0    0.0    0.0   0.0  
1    0.0    0.0    0.0       0.0  0.0   0.0    0.0    0.0   0.0  
2    0.0    0.0    0.0       0.0  0.0   0.0    0.0    0.0   0.0  
3    0.0    0.0    0.0       0.0  0.0   0.0    0.0    0.0   0.0  
4    0.0    0.0    0.0       0.0  0.0   0.0    0.0    0.0   0.0  

[5 rows x 1000 columns]


In [10]:
# Step 5: Train-Test Split


In [11]:
from sklearn.model_selection import train_test_split

# Features are the TF-IDF columns; the target is the sentiment-derived 'demand' label
X, y = tfidf_df, df['demand']

# 70% of the rows go to training and 30% to testing; the fixed random_state
# makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

# Report the shape of each partition
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")


Training set size: (910, 1000)
Test set size: (390, 1000)


In [12]:
# Step 6: Train the Random Forest Classifier

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random forest with 100 trees; random_state is fixed so repeated runs match
clf = RandomForestClassifier(random_state=42, n_estimators=100)

# Fit on the training partition
clf.fit(X_train, y_train)

# Predict labels for the held-out test rows
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the test set.
# NOTE(review): the recorded run shows 1.00 on every metric. The 'demand'
# label is computed from the same text the TF-IDF features come from, so
# check for duplicated articles on both sides of the split (or other leakage)
# before trusting these scores.
print("Classification Report:")
report = classification_report(y_test, y_pred)
print(report)


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       178
           1       1.00      1.00      1.00       212

    accuracy                           1.00       390
   macro avg       1.00      1.00      1.00       390
weighted avg       1.00      1.00      1.00       390

