In [43]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from textblob import TextBlob
import scipy.sparse

In [44]:
# Load and preprocess the data
df = pd.read_csv('Data.csv', encoding='ISO-8859-1')

In [45]:
# Split data into training and testing sets based on date
train = df[df['Date'] < '20150101']
test = df[df['Date'] > '20141231']

In [46]:
# Prepare text data - concatenate headlines into a single string for each row
train_headlines = [' '.join(str(x) for x in train.iloc[row, 2:27]) for row in range(len(train))]
test_headlines = [' '.join(str(x) for x in test.iloc[row, 2:27]) for row in range(len(test))]


In [47]:
# Convert text data into TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=2, max_df=0.9)
X_train = vectorizer.fit_transform(train_headlines)
X_test = vectorizer.transform(test_headlines)

In [48]:
# Calculate sentiment polarity using TextBlob for each headline and add as a feature
train_sentiment = [TextBlob(headline).sentiment.polarity for headline in train_headlines]
test_sentiment = [TextBlob(headline).sentiment.polarity for headline in test_headlines]


In [49]:
# Stack sentiment features with TF-IDF features
X_train = scipy.sparse.hstack((X_train, scipy.sparse.csr_matrix(train_sentiment).T))
X_test = scipy.sparse.hstack((X_test, scipy.sparse.csr_matrix(test_sentiment).T))

In [50]:
# Extract labels for training and testing
y_train = train['Label']
y_test = test['Label']

In [51]:
# Initialize and train the XGBoost classifier
xgb_model = XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
xgb_model.fit(X_train, y_train)


In [52]:
# Make predictions on the test set
predictions = xgb_model.predict(X_test)

In [53]:
# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
class_report = classification_report(y_test, predictions)

# Display the results
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.8412698412698413
Confusion Matrix:
[[152  34]
 [ 26 166]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.82      0.84       186
           1       0.83      0.86      0.85       192

    accuracy                           0.84       378
   macro avg       0.84      0.84      0.84       378
weighted avg       0.84      0.84      0.84       378



In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Example training setup
vectorizer = TfidfVectorizer(max_features=5001)  # or the exact parameters used
X_train_transformed = vectorizer.fit_transform(train_headlines)  # Fit on training data

# Train your XGBoost model
model = XGBClassifier()
model.fit(X_train_transformed, y_train)

# Save both the model and vectorizer
joblib.dump(model, "modelx.joblib")
joblib.dump(vectorizer, "vectorizerx.joblib")


['vectorizerx.joblib']

In [55]:
# import joblib
# joblib.dump(xgb_model, 'model2.joblib')
# joblib.dump(vectorizer, 'vectorizer2.joblib')

In [56]:
# import pandas as pd
# import joblib

# # Load the saved model and CountVectorizer
# model = joblib.load('model2.joblib')
# countvector = joblib.load('vectorizer2.joblib')

# # Create the new headline data
# new_headline_data = pd.DataFrame({
#     'Date': ['2000-02-02'],
#     'Top1': ["Hospital case"],
#     'Top2': ["English women fall apart in third defeat"],
#     'Top3': ["Kick flicks are coming soon to a cinema near you"],
#     'Top4': ["Italy orders anti-fascist snatch squads at Lazio"],
#     'Top5': ["Specialist consigns Owen to month on the sidelines"],
#     'Top6': ["Leboeuf banned as Vialli plots comeback"],
#     'Top7': ["Debts force Airdrie into liquidation"],
#     'Top8': ["Housing: Where the heart is"],
#     'Top9': ["Zimbabwe visit under threat"],
#     'Top10': ["Bates blasts 'meddling' ministers"],
#     'Top11': ["The farmer's in his den"],
#     'Top12': ["PM tells farmers to be flexible"],
#     'Top13': ["Hague's big gamble"],
#     'Top14': ["Win tickets to the Fifth Element and Dan Weil Q&A"],
#     'Top15': ["Lions lie in wait for their hobbling hosts"],
#     'Top16': ["Redwood 'shocked' at sacking"],
#     'Top17': ["Germany and France condemn Haider"],
#     'Top18': ["Deserving a chance, Denying a rumour and Delivering a statement"],
#     'Top19': ["Lawyer's record payout after race bias at CPS"],
#     'Top20': ["Race team joins hanging inquiry into Telford deaths"],
#     'Top21': ["Father of hanged Telford man dismisses race murder claims"],
#     'Top22': ["Shock exchange"],
#     'Top23': ["Blair peace plea as talks continue"],
#     'Top24': ["Haider: son of Nazis who defies labels"],
#     'Top25': ["Austria defies isolation threats"],
#     'Top26': ["America defies isolation threats"]
# })

# # Preprocess the new data: Combine all headlines into a single string
# combined_headline = ' '.join(new_headline_data.iloc[0, 1:].str.lower())

# # Transform the new data using the loaded CountVectorizer
# new_data_transformed = countvector.transform([combined_headline])

# # Calculate sentiment polarity for the new headline
# new_sentiment = TextBlob(combined_headline).sentiment.polarity

# # Stack the sentiment feature with the transformed data
# new_data_with_sentiment = scipy.sparse.hstack((new_data_transformed, scipy.sparse.csr_matrix([new_sentiment]).T))

# # Use the model to make predictions on the new data
# prediction = model.predict(new_data_with_sentiment)

# # Display the prediction
# print("Prediction for new data:", prediction[0])  # Output will be 0 or 1 based on the model's training


In [59]:
import pandas as pd
import joblib
from scipy import sparse

# Load the trained model and vectorizer
model = joblib.load("modelx.joblib")  # path to your trained model
vectorizer = joblib.load("vectorizerx.joblib")  # path to your TF-IDF vectorizer

# Define new headline data
new_headline_data = pd.DataFrame({
    'Date': ['2000-02-02'],
    'Top1': ["Hospital case"],
    'Top2': ["English women fall apart in third defeat"],
    'Top3': ["Kick flicks are coming soon to a cinema near you"],
    'Top4': ["Italy orders anti-fascist snatch squads at Lazio"],
    'Top5': ["Specialist consigns Owen to month on the sidelines"],
    'Top6': ["Leboeuf banned as Vialli plots comeback"],
    'Top7': ["Debts force Airdrie into liquidation"],
    'Top8': ["Housing: Where the heart is"],
    'Top9': ["Zimbabwe visit under threat"],
    'Top10': ["Bates blasts 'meddling' ministers"],
    'Top11': ["The farmer's in his den"],
    'Top12': ["PM tells farmers to be flexible"],
    'Top13': ["Hague's big gamble"],
    'Top14': ["Win tickets to the Fifth Element and Dan Weil Q&A"],
    'Top15': ["Lions lie in wait for their hobbling hosts"],
    'Top16': ["Redwood 'shocked' at sacking"],
    'Top17': ["Germany and France condemn Haider"],
    'Top18': ["Deserving a chance, Denying a rumour and Delivering a statement"],
    'Top19': ["Lawyer's record payout after race bias at CPS"],
    'Top20': ["Race team joins hanging inquiry into Telford deaths"],
    'Top21': ["Father of hanged Telford man dismisses race murder claims"],
    'Top22': ["Shock exchange"],
    'Top23': ["Blair peace plea as talks continue"],
    'Top24': ["Haider: son of Nazis who defies labels"],
    'Top25': ["Austria defies isolation threats"],
    'Top26': ["America defies isolation threats"]
})

# Concatenate all headlines into a single text block for processing
combined_text = " ".join(new_headline_data.iloc[0, 1:])

# Transform the text with the TF-IDF vectorizer
transformed_data = vectorizer.transform([combined_text])

# Convert sparse matrix to dense format if necessary
if sparse.issparse(transformed_data):
    transformed_data = transformed_data.toarray()

# Make prediction using the loaded model
try:
    prediction = model.predict(transformed_data)[0]
    print("Prediction:", prediction)
except ValueError as e:
    print(f"Prediction error: {e}")
    print("Input data shape:", transformed_data.shape)


Prediction: 0
