In [1]:
# Importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Importing the train and test data.
# Both paths are relative, so the CSV files must sit in the notebook's
# working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
# Preview the first five rows of the training frame
print("Train Data:")
train.head(n=5)

Train Data:


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Preview the first five rows of the test frame
print("\nTest Data:")
test.head(n=5)


Test Data:


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
# Summary statistics for the training frame; with default arguments
# .describe() reports on the numeric columns only
print("\nTrain Data Description:")
train.describe()


Train Data Description:


Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [6]:
# Counting nulls in every column of the training frame (not only text and
# target). The columns the model relies on -- text and target -- should
# show zero here.
print("\nNull Counts in Train Data:")
train.isnull().sum()


Null Counts in Train Data:


id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [7]:
# Proportion of each class (0 vs. 1) among the training labels
print("\nClass Distribution in Train Data:")
train['target'].value_counts(normalize=True)


Class Distribution in Train Data:


target
0    0.57034
1    0.42966
Name: proportion, dtype: float64

In [8]:
# Features are the raw text column; the label is the binary target column
X, y = train['text'], train['target']

In [9]:
# Hold out part of the labelled data for evaluation.
# Stratifying on y keeps the class proportions the same in both splits,
# and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=115,
)

In [10]:
# Instantiating CountVectorizer with its default settings
cvec = CountVectorizer()

In [11]:
# Fitting the vectorizer on the training split only, so the vocabulary is
# learned without seeing the held-out rows
cvec.fit(X_train)

In [12]:
# Transforming both splits with the same fitted vectorizer, so the
# training and held-out matrices share one set of feature columns
X_train_cv = cvec.transform(X_train)
X_test_cv = cvec.transform(X_test)

In [13]:
# Fit a default logistic regression on the count features (fit returns the
# estimator itself), then score it on the split it was fitted on and on the
# held-out split. A large gap between the two accuracies points to
# overfitting.
lr = LogisticRegression().fit(X_train_cv, y_train)
train_accuracy = lr.score(X_train_cv, y_train)
test_accuracy = lr.score(X_test_cv, y_test)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

Training Accuracy: 0.9698721317218427
Test Accuracy: 0.8082983193277311


In [14]:
# Transforming the test.csv text with the already-fitted vectorizer
# (transform only, no refit, so the columns match the features the model
# was trained on)
test_cv = cvec.transform(test['text'])

In [15]:
# Using the vectorized test.csv features to generate class predictions
preds = lr.predict(test_cv)
print("\nPredictions:")
# Bare expression so the notebook displays the prediction array
preds


Predictions:


array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

In [16]:
# Build the submission frame: the test ids alongside their predicted target
preds_df = test[['id']].assign(target=preds)

In [17]:
# Sanity check: look at the first five rows of the submission frame
print("\nPredictions DataFrame:")
preds_df.head(n=5)


Predictions DataFrame:


Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [18]:
# Reviewing the dataframe with .describe(); because target holds only 0/1
# predictions, its mean is the fraction of rows predicted as class 1
print("\nSummary of Predictions DataFrame:")
preds_df.describe()


Summary of Predictions DataFrame:


Unnamed: 0,id,target
count,3263.0,3263.0
mean,5427.152927,0.360405
std,3146.427221,0.480191
min,0.0,0.0
25%,2683.0,0.0
50%,5500.0,0.0
75%,8176.0,1.0
max,10875.0,1.0


In [19]:
# What share of the predictions are 0s and 1s?
print("\nClass Distribution in Predictions:")
preds_df['target'].value_counts(normalize = True)


Class Distribution in Predictions:


target
0    0.639595
1    0.360405
Name: proportion, dtype: float64

In [20]:
# Write the predictions to disk; index=False keeps the row index out of
# the file so it contains only the id and target columns
preds_df.to_csv('submission.csv', index=False)