<a href="https://colab.research.google.com/github/venkatesh-r96/Article_Types/blob/main/Article_Types.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [3]:
pip install transformers



In [4]:
from transformers import BertTokenizer, BertModel

In [5]:
encodings = ['utf-8', 'ISO-8859-1', 'cp1252', 'utf-16', 'latin1']

for encoding in encodings:
    try:
        df = pd.read_csv("/content/articles.csv", encoding=encoding)
        print("File read successfully with encoding:", encoding)
        break  # Exit the loop if successfully read
    except UnicodeDecodeError:
        print("Error with encoding:", encoding)


Error with encoding: utf-8
File read successfully with encoding: ISO-8859-1


In [6]:
df

Unnamed: 0,Id,Heading,Article.Banner.Image,Outlets,Article.Description,Full_Article,Article_Type,Tonality
0,d6995462-5e87-453b-b64d-e9f1df6e94d2,"A Puzzling Maneuver, Then Freefall: NTSB Repor...",,Essex Caller,<p>The helicopter that crashed in Southeast Al...,<p>The helicopter that crashed in Southeast Al...,Commercial,Negative
1,8b05e939-a89e-4548-b92b-013822e8ee7d,Bells Nexus Air Taxi Concept Rings Changes Fo...,,Aviation Week Network,<p>A year after teasing the fledgling electric...,<p>A year after teasing the fledgling electric...,Commercial,Positive
2,69fcd400-bceb-4255-8277-619f2d68ac0b,Bell Helicopter Show Air Taxi Nexus,http://images.tmtpost.com/uploads/images/2019/...,TMTPost,<p>Bell released the full-size design of the v...,<p>Bell released the full-size design of the v...,Commercial,Positive
3,17943578-c11b-414b-b3f5-063d3a93157b,BELL DÉVOILE LA CONCEPTION INTÉGRALE DE SON TA...,http://www.fredzone.org/wp-content/uploads/201...,Fredzone,<p>Bell est une soci&eacute;t&eacute; am&eacut...,<p>Bell est une soci&eacute;t&eacute; am&eacut...,Commercial,Positive
4,f33c7b11-5f77-4a98-bb2e-d36689042aea,Les premiers retours dOlivier Ezratty,,FrenchWeb,<p>It was still anecdotal to observe the explo...,<p>It was still anecdotal to observe the explo...,Commercial,Positive
...,...,...,...,...,...,...,...,...
4300,a40e5017-0a38-4d06-bcbe-616b73456c94,"Chinook Catches Army Flirting With Younger, Th...",https://www.duffelblog.com/wp-content/uploads/...,Duffel Blog,Long rumored tensions came to a head as the CH...,<p>Long rumored tensions came to a head as the...,Military,
4301,782ba519-bcb0-4ef1-873d-713a18b04576,Lufthansa Aviation Selects Reiser Simulation H...,,Vertical,<p>In the course of upcoming investments in ne...,<p>In the course of upcoming investments in ne...,Commercial,Positive
4302,deb31e5d-15c0-4c1d-843c-ce02e9081746,This Bell Flight Drone Wont Be Delivering Pizza,,"WFAA-TV ABC (Dallas, TX)","<p>At Bell Flight in Fort Worth, engineers are...","<p>At Bell Flight in Fort Worth, engineers are...",Military,Positive
4303,f7125b1d-a687-469b-a799-c8cb4443b1d1,Blade Offers New York Airport Transfers for $1...,https://assets.bwbx.io/images/users/iqjWHBFdfx...,Bloomberg,<p>Getting to this price point took about five...,<p>Getting to this price point took about five...,Commercial,Positive


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4305 entries, 0 to 4304
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Id                    4305 non-null   object
 1   Heading               4305 non-null   object
 2   Article.Banner.Image  1753 non-null   object
 3   Outlets               4305 non-null   object
 4   Article.Description   4305 non-null   object
 5   Full_Article          4305 non-null   object
 6   Article_Type          4305 non-null   object
 7   Tonality              3873 non-null   object
dtypes: object(8)
memory usage: 269.2+ KB


In [8]:
# Checking for null values
df.isnull().sum()

Id                         0
Heading                    0
Article.Banner.Image    2552
Outlets                    0
Article.Description        0
Full_Article               0
Article_Type               0
Tonality                 432
dtype: int64

In [9]:
df.drop(['Id','Article.Banner.Image','Outlets'], axis= 1, inplace = True)
df

Unnamed: 0,Heading,Article.Description,Full_Article,Article_Type,Tonality
0,"A Puzzling Maneuver, Then Freefall: NTSB Repor...",<p>The helicopter that crashed in Southeast Al...,<p>The helicopter that crashed in Southeast Al...,Commercial,Negative
1,Bells Nexus Air Taxi Concept Rings Changes Fo...,<p>A year after teasing the fledgling electric...,<p>A year after teasing the fledgling electric...,Commercial,Positive
2,Bell Helicopter Show Air Taxi Nexus,<p>Bell released the full-size design of the v...,<p>Bell released the full-size design of the v...,Commercial,Positive
3,BELL DÉVOILE LA CONCEPTION INTÉGRALE DE SON TA...,<p>Bell est une soci&eacute;t&eacute; am&eacut...,<p>Bell est une soci&eacute;t&eacute; am&eacut...,Commercial,Positive
4,Les premiers retours dOlivier Ezratty,<p>It was still anecdotal to observe the explo...,<p>It was still anecdotal to observe the explo...,Commercial,Positive
...,...,...,...,...,...
4300,"Chinook Catches Army Flirting With Younger, Th...",Long rumored tensions came to a head as the CH...,<p>Long rumored tensions came to a head as the...,Military,
4301,Lufthansa Aviation Selects Reiser Simulation H...,<p>In the course of upcoming investments in ne...,<p>In the course of upcoming investments in ne...,Commercial,Positive
4302,This Bell Flight Drone Wont Be Delivering Pizza,"<p>At Bell Flight in Fort Worth, engineers are...","<p>At Bell Flight in Fort Worth, engineers are...",Military,Positive
4303,Blade Offers New York Airport Transfers for $1...,<p>Getting to this price point took about five...,<p>Getting to this price point took about five...,Commercial,Positive


In [10]:
import re

# Define a function to clean and preprocess a heading
def clean_heading(heading):
    # Remove special characters, punctuation, and non-alphanumeric characters
    heading = re.sub(r'[^a-zA-Z0-9\s]', '', heading)
    # Convert to lowercase
    heading = heading.lower()
    # Remove extra whitespace
    heading = ' '.join(heading.split())
    return heading

# Apply the clean_heading function to the 'Heading' column in your DataFrame
df['Heading'] = df['Heading'].apply(clean_heading)

# Display the cleaned 'Heading' column
print(df['Heading'])


0       a puzzling maneuver then freefall ntsb report ...
1       bells nexus air taxi concept rings changes for...
2                     bell helicopter show air taxi nexus
3       bell dvoile la conception intgrale de son taxi...
4                   les premiers retours dolivier ezratty
                              ...                        
4300    chinook catches army flirting with younger thi...
4301    lufthansa aviation selects reiser simulation h...
4302      this bell flight drone wont be delivering pizza
4303    blade offers new york airport transfers for 19...
4304                    us little birds flying to lebanon
Name: Heading, Length: 4305, dtype: object


In [11]:
# Define a function to clean and preprocess text
def clean_text(text):
    # Remove special characters, punctuation, and non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

# Apply the clean_text function to the 'Article.Description' and 'Full_Article' columns
df['Article.Description'] = df['Article.Description'].apply(clean_text)
df['Full_Article'] = df['Full_Article'].apply(clean_text)

# Display the cleaned 'Article.Description' and 'Full_Article' columns
print(df['Article.Description'])
print(df['Full_Article'])


0       pthe helicopter that crashed in southeast alas...
1       pa year after teasing the fledgling electric v...
2       pbell released the fullsize design of the vert...
3       pbell est une socieacuteteacute ameacutericain...
4       pit was still anecdotal to observe the explosi...
                              ...                        
4300    long rumored tensions came to a head as the ch...
4301    pin the course of upcoming investments in new ...
4302    pat bell flight in fort worth engineers are wo...
4303    pgetting to this price point took about five y...
4304    lebanon is to receive a new 120 million us mil...
Name: Article.Description, Length: 4305, dtype: object
0       pthe helicopter that crashed in southeast alas...
1       pa year after teasing the fledgling electric v...
2       pbell released the fullsize design of the vert...
3       pbell est une socieacuteteacute ameacutericain...
4       pit was still anecdotal to observe the explosi...
                 

In [12]:
df['Tonality'].value_counts()

Positive    3286
Negative     331
Neutral      256
Name: Tonality, dtype: int64

In [13]:
df['Article_Type'].value_counts()

Commercial            2470
Military              1677
Executives              65
Others                  52
Support & Services      26
Financing                9
Training                 6
Name: Article_Type, dtype: int64

In [14]:
df['Article_Type'] = df['Article_Type'].map({'Commercial':6, "Military": 5,"Executives": 4,"Others ": 3,"Support & Services": 2,"Financing": 1,"Training":0})

In [15]:
# Calculate the mode of the 'Tonality' column
mode_value = df['Tonality'].mode()[0]  # Assuming mode returns a single value

# Fill null values with the mode value
df['Tonality'].fillna(mode_value, inplace=True)


In [16]:
df['Tonality'] = df['Tonality'].map({'Positive':2, "Negative": 1,"Neutral": 0})

In [17]:
df['Tonality'].value_counts()

2    3718
1     331
0     256
Name: Tonality, dtype: int64

In [18]:
df.columns


Index(['Heading', 'Article.Description', 'Full_Article', 'Article_Type',
       'Tonality'],
      dtype='object')

In [19]:
df.dtypes

Heading                 object
Article.Description     object
Full_Article            object
Article_Type           float64
Tonality                 int64
dtype: object

In [20]:
pip install sentence-transformers



In [None]:
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd


# Combine the text columns into a single 'Combined_Text' column
df['Combined_Text'] = df['Heading'] + ' ' + df['Article.Description'] + ' ' + df['Full_Article']

# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Tokenize the text data
tokenized_text = df['Combined_Text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

# Padding and truncating tokens (you may need to adjust max length based on your data)
max_length = 128
padded_text = torch.tensor([t[:max_length] + [0] * (max_length - len(t[:max_length])) for t in tokenized_text])

# Generate BERT embeddings
with torch.no_grad():
    embeddings = model(padded_text)

# 'embeddings' now contains BERT embeddings for your text data


Token indices sequence length is longer than the specified maximum sequence length for this model (672 > 512). Running this sequence through the model will result in indexing errors
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [None]:
from sklearn.model_selection import train_test_split

# Split your data into training and testing sets
X = embeddings  # Use the BERT embeddings
y = df['Article_Type']  # Your target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.svm import SVC

# Create an SVM classifier (you can choose other classifiers as needed)
classifier = SVC()


In [None]:
# Train the classifier
classifier.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report

# Predict on the test set
y_pred = classifier.predict(X_test)

# Evaluate the model
report = classification_report(y_test, y_pred)
print(report)


In [None]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)

print(report)
