# SI 670 Kaggle 1

## 1. Read dataset

In [15]:
import pandas as pd
train = pd.read_csv('data/si670_kaggle1_train.csv')
valid = pd.read_csv('data/si670_kaggle1_validation.csv')
test = pd.read_csv('data/test.csv')

In [16]:
print("Train shape:", train.shape)
print("Validation shape:", valid.shape)
print("Test shape:", test.shape)

Train shape: (319071, 3)
Validation shape: (56792, 3)
Test shape: (60743, 2)


Now, we can have a look at the first few rows of the training data.

In [17]:
train.head()

Unnamed: 0,text,label,id
0,White girls very rarely date Asian men. Even i...,1,0
1,I am a 23 year old male Indian American male. ...,1,1
2,"Take three people, Persons A, B, and C. They l...",1,2
3,(A) Work part-time in high school; Then go to ...,1,3
4,When police introduce a new form of speed prev...,1,4


Next, we can check weather there exist any null values in the dataset.

In [18]:
train.isnull().sum()

text     0
label    0
id       0
dtype: int64

It seems like there are no nulls in the dataset.

In [19]:
ratio_train = train['label'].mean()
ratio_train

np.float64(0.2924678206418007)

In [20]:
ratio_valid = valid['label'].mean()
ratio_valid

np.float64(0.5070960698689956)

## 2. Data processing / text processing

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df= 0.00005)
tfidf_train = vectorizer.fit_transform(train['text'])
y_train = train['label']
tfidf_valid = vectorizer.transform(valid['text'])
y_valid = valid['label']
tfidf_test = vectorizer.transform(test['text'])

tfidf_train.shape

(319071, 439151)

try separate 1-gram and 2-gram, not using 3-gram

In [22]:
vectorizer_uni = TfidfVectorizer(
    ngram_range=(1,1),
    min_df=0.00005,
    max_df=0.8,
    stop_words='english'
)

tfidf_train_uni = vectorizer_uni.fit_transform(train['text'])
tfidf_valid_uni = vectorizer_uni.transform(valid['text'])
tfidf_test_uni = vectorizer_uni.transform(test['text'])

vectorizer_bi = TfidfVectorizer(
    ngram_range=(2,2),
    min_df=0.00005,
    max_df=0.8,
    stop_words='english'
)

tfidf_train_bi = vectorizer_bi.fit_transform(train['text'])
tfidf_valid_bi = vectorizer_bi.transform(valid['text'])
tfidf_test_bi = vectorizer_bi.transform(test['text'])

# vectorizer_tri = TfidfVectorizer(
#     ngram_range=(3,3),
#     min_df=0.00005,
#     max_df=0.8,
#     stop_words='english'
# )
# 
# tfidf_train_tri = vectorizer_bi.fit_transform(train['text'])
# tfidf_valid_tri = vectorizer_bi.transform(valid['text'])
# tfidf_test_tri = vectorizer_bi.transform(test['text'])

combine 1-gram and 2-gram to a single feature set

In [23]:
from scipy.sparse import hstack
tfidf_train_all = hstack([tfidf_train_uni, tfidf_train_bi])
tfidf_valid_all = hstack([tfidf_valid_uni, tfidf_valid_bi])
tfidf_test_all = hstack([tfidf_test_uni, tfidf_test_bi])
tfidf_train_all.shape

(319071, 285296)

train on all

In [24]:
df_all = pd.concat([train, valid], axis=0, ignore_index=True)
X_all_text = df_all['text']
y_all = df_all['label']
vectorizer_all = TfidfVectorizer(ngram_range=(1,2), min_df= 0.00005)
tfidf_all = vectorizer_all.fit_transform(X_all_text)
tfidf_valid = vectorizer_all.transform(valid['text'])
y_valid = valid['label']
tfidf_test = vectorizer_all.transform(test['text'])
tfidf_all.shape

(375863, 444252)

How to deal with high dimensionality?

In [25]:
# from sklearn.decomposition import TruncatedSVD

# vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.8, stop_words='english')
# tfidf_train = vectorizer.fit_transform(train['text'])
# tfidf_valid = vectorizer.transform(valid['text'])

# svd = TruncatedSVD(n_components=1000, random_state=670)
# tfidf_train_svd = svd.fit_transform(tfidf_train)
# tfidf_valid_svd = svd.transform(tfidf_valid)

# print("Original TF-IDF shape:", tfidf_train.shape)
# print("Reduced TF-IDF shape:", tfidf_train_svd.shape)


## 3. Train on Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

model = LogisticRegression(penalty='l2',class_weight='balanced', max_iter=100, C=20, random_state=670, solver='liblinear')
model.fit(tfidf_all, y_all)
y_valid_pred_probas = model.predict_proba(tfidf_valid)
y_pred = (y_valid_pred_probas[:, 1] >= 0.5).astype(int)
print('F1 Score:', f1_score(y_valid, y_pred))

F1 Score: 0.9943469412603712


In [27]:
y_pred = (y_valid_pred_probas[:, 1] >= 0.34).astype(int)
print('F1 Score:', f1_score(y_valid, y_pred))

F1 Score: 0.9888128522917277


In [28]:
test_pred_probas = model.predict_proba(tfidf_test)
test_pred = (test_pred_probas[:, 1] >= 0.34).astype(int)
out_df = pd.DataFrame({'id': test['id'], 'label': test_pred})
out_df.to_csv('test_pred.csv', index=False)