# IEMS5780 - Assignment 2

1155169095
Yang Xinyi

In [3]:
# Imports
import glob
import pandas as pd
import csv

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

## Data Preprocessing (20%)

● Download the data source, news.csv. 
● Split the dataset to 80% training set and 20% testing set. 
● Check and report the ratio of real-to-fake news are roughly the same in both training and testing sets.

In [12]:
def combine(dataset_path, is_shuffle=False, save_path=None):
    
    
    # 1. Download the data source, news.csv.
    
    print('Date pre-processing...')
    data = []           
    csv_file = open(dataset_path + 'news.csv', 'r', encoding='utf-8')
    reader = csv.DictReader(csv_file)
    real,fake = 1, 1
    for row in reader:
        if row['label'] == 'REAL':
            real += 1
            row['label'] = 1 # change REAL to 0
        else:
            fake += 1
            row['label'] = 0
        row['text'] = row['title'] + row['text'] # append title with text
        row.pop('title')
        row.pop('')
        data += [row]
    # Load datalist into DataFrame
    df = pd.DataFrame(data, columns=['text', 'label'])
    # Shuffle
    if is_shuffle:
        df = df.sample(frac=1)
        
        
    # 2. Split the dataset to 80% training set and 20% testing set.
    
    df_train, df_test = train_test_split(df, test_size=0.2)

    # Save DataFrame to csv file.
    if save_path is not None:
        with open(save_path + 'train.csv', 'w', encoding='utf8') as f:
            df_train.to_csv(f)
        with open(save_path + 'test.csv', 'w', encoding='utf8') as f:
            df_test.to_csv(f)
            
            
    # 3. Check and report the ratio of real-to-fake news.
    
    # ratio of real-to-fake: 1.0022116903633491. 
    # Thus the ratio of real-to-fake news are roughly the same in both training and testing sets
    print('real num: ' + str(real))
    print('fake num: ' + str(fake))
    print('ratio of real-to-fake: ' + str( real / fake ))
            
    return df_train, df_test

# Data preprocessing Please fill your path of dataset and output file.
filepath = 'C:\\Users\\a\\Desktop\\IEMS5780A2\\Assignment1\\'
print(filepath)
train, test = combine(filepath, True, None)
print(train.head(3))

C:\Users\a\Desktop\IEMS5780A2\Assignment1\
Date pre-processing...
real num: 3172
fake num: 3165
ratio of real-to-fake: 1.0022116903633491
                                                   text  label
4941  Clinton regrets 1996 remark on ‘super-predator...      1
4066  The Next Big Shoe to DropThe Next Big Shoe to ...      0
27    Donald Trump Is Changing His Campaign Slogan t...      1


## 3. Training Logistic Regression Models with Adding Bi-Grams to the Model (30%)

● Prepare pipeline building up using sklearn’s CounterVectorizer and TfidfVectorizer.
● Add bigram in both vectorizers. 
● Train logistic regression classifiers using the training set. 
● Compute (i) accuracy, (ii) precision and (iii) recall based on the testing set. 
● Save your models in a .pkl file using joblib.

In [15]:
from joblib import dump
def logistic_regression_count_bigram(train, test, path):
    print('Training Logistic Regression model with biigram CountVectorize...')
    # Extract documents and labels.
    
    docs_train = train['text']
    labels_train = train['label']
    docs_test = test['text']
    labels_test = test['label']
    
    # 1. Prepare pipeline building up using sklearn’s CounterVectorizer and TfidfVectorizer. Add bigram in both vectorizers. 
    model = Pipeline([
        ('vec', CountVectorizer(ngram_range=(1,2))),
        ('log', LogisticRegression(max_iter=5000))
    ])
    
    
    # 3. Train logistic regression classifiers using the training set. 
    model.fit(docs_train, labels_train)
    
    
    # 4. Compute (i) accuracy, (ii) precision and (iii) recall based on the testing set. 
    y_pred = model.predict(docs_test)
    print(classification_report(labels_test, y_pred))
    
    
    # 5. Save your models in a .pkl file using joblib.
    dump(model, path)
    
    
logistic_regression_count_bigram(train, test, 'model1.pkl')

Training Logistic Regression model with biigram CountVectorize...
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       628
           1       0.96      0.94      0.95       639

    accuracy                           0.95      1267
   macro avg       0.95      0.95      0.95      1267
weighted avg       0.95      0.95      0.95      1267



In [14]:
from joblib import dump
def logistic_regression_tfidf_bigram(train, test, path):
    print('Training Logistic Regression model with biigram CountVectorize...')
    # Extract documents and labels.
    
    docs_train = train['text']
    labels_train = train['label']
    docs_test = test['text']
    labels_test = test['label']
    
    # 1. Prepare pipeline building up using sklearn’s TfidfVectorizer. Add bigram in both vectorizers. 
    model = Pipeline([
    ('vec', TfidfVectorizer()),
    ('log', LogisticRegression(max_iter=5000))
    ])
    
    
    # 3. Train logistic regression classifiers using the training set. 
    model.fit(docs_train, labels_train)
    
    
    # 4. Compute (i) accuracy, (ii) precision and (iii) recall based on the testing set. 
    y_pred = model.predict(docs_test)
    print(classification_report(labels_test, y_pred))
    
    
    # 5. Save your models in a .pkl file using joblib.
    dump(model, path)
    
    
logistic_regression_tfidf_bigram(train, test, 'model2.pkl')

Training Logistic Regression model with biigram CountVectorize...
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       628
           1       0.93      0.91      0.92       639

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267

