In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn import metrics
from sklearn.pipeline import Pipeline
from scipy import stats
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, mean_squared_error, r2_score
from nltk.corpus import stopwords
import joblib

source_folder = '../data/twitter_data/'

In [2]:
def clean_data(DataFrame):
    stop_words = stopwords.words('english')
    stop_words.remove('not')
    stop_words.remove('no') 
    # remove stop words
    DataFrame['tweet_text'] = DataFrame['tweet_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
    # remove punctuation
    DataFrame['tweet_text'] = DataFrame['tweet_text'].str.replace('[^\w\s]','')
    # remove numbers
    DataFrame['tweet_text'] = DataFrame['tweet_text'].str.replace('\d+', '')
    # remove http links
    DataFrame['tweet_text'] = DataFrame['tweet_text'].str.replace('http\S+|www.\S+', '', case=False)
    
    return DataFrame

In [3]:
train = pd.read_csv(source_folder+'/tweet_train.csv', sep=',')
test = pd.read_csv(source_folder+'/tweet_test.csv', sep=',')
new_data_test = pd.read_csv(source_folder+'/new_tweet_data_clean.csv', sep=',')
#test = clean_data(test)
# remove id column
#test = test.drop(['id'], axis=1)
# rename columns
#test = test.rename(columns={'tweet_text': 'Tweet', 'label': 'label'})
#new_tweet_data_clean = test.to_csv(source_folder+'/new_tweet_data_clean.csv', index=False)

In [4]:
train['Tweet'] = train['Tweet'].str.lower()
test['Tweet'] = test['Tweet'].str.lower()
new_data_test['Tweet'] = new_data_test['Tweet'].str.lower()

In [5]:
X_train = train['Tweet'].fillna(' ')
y_train = train['label'].fillna(' ')
X_test = test['Tweet'].fillna(' ')
y_test = test['label'].fillna(' ')
X_new = new_data_test['Tweet'].fillna(' ')
y_new = new_data_test['label'].fillna(' ')

In [6]:
pipeline = Pipeline([
    ('vect', CountVectorizer(binary=True, analyzer='char', ngram_range=(1,5), min_df = 10, max_df = 0.95)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('clf', LogisticRegression(C=10, max_iter=1000))
])

In [10]:
# train the model
model = pipeline.fit(X_train, y_train)

# predict the test set
y_pred = pipeline.predict(X_test)

# save model and predictions
joblib.dump(model, '../saved_models/LR_model.pkl')
joblib.dump(y_pred, '../saved_models/LR_predictions.pkl')

#load model and predictions
model = joblib.load('../saved_models/LR_model.pkl')
y_pred = joblib.load('../saved_models/LR_predictions.pkl')


['../saved_models/LR_predictions.pkl']

In [9]:
print("AUC score: ", roc_auc_score(y_test, y_pred))
print("Accuracy : ", metrics.accuracy_score(y_test, y_pred))

AUC score:  0.8058879445314246
Accuracy :  0.805607476635514


In [10]:
print('Mean Squared Error: ', mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f'
      % r2_score(y_test, y_pred))

Mean Squared Error:  0.19439252336448598
Coefficient of determination: 0.22


In [11]:
# predict the new data set
y_pred_new = pipeline.predict(X_new)

# save predictions
joblib.dump(y_pred_new, '../saved_models/LR_predictions_new.pkl')

print("AUC score: ", roc_auc_score(y_new, y_pred_new))
print("Accuracy : ", metrics.accuracy_score(y_new, y_pred_new))

AUC score:  0.7697676232064878
Accuracy :  0.8585807750377453
