# Transcription: NLP Getting Started Tutorial

[https://www.kaggle.com/philculliton/nlp-getting-started-tutorial](https://www.kaggle.com/philculliton/nlp-getting-started-tutorial)

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

In [14]:
# Directory that holds the competition CSV files. Kept in one constant so the
# train/test loads below can be pointed at another location by editing one line.
DATA_DIR = "/kaggle/input/nlp-getting-started"

# Training data (includes the `target` column) and test data (no `target`).
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

In [15]:
# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [16]:
# Preview the first rows of the test data.
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


## A quick look at our data

In [17]:
# Show one training text whose target is 0: select the `text` column for the
# matching rows in a single .loc lookup, then take the entry at position 1.
train_df.loc[train_df['target'] == 0, 'text'].values[1]

'I love fruits'

In [18]:
# Show one training text whose target is 1: select the `text` column for the
# matching rows in a single .loc lookup, then take the entry at position 1.
train_df.loc[train_df['target'] == 1, 'text'].values[1]

'Forest fire near La Ronge Sask. Canada'

## Building vectors

In [19]:
# Small worked example: fit a fresh CountVectorizer on only the first five
# training texts and encode those same five texts as count vectors.
count_vectorizer = CountVectorizer()
example_train_vectors = count_vectorizer.fit_transform(train_df['text'].iloc[:5])

In [20]:
# Shape of the first example's count vector once converted to dense form.
print(example_train_vectors[0].todense().shape)

(1, 54)


In [21]:
# Print the first example's count vector in dense form.
print(example_train_vectors[0].todense())

[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [22]:
# Refit the vectorizer on every training text (it was previously fitted on the
# five-row example), then encode the test texts with transform() only so that
# both matrices come from the same fitted vectorizer.
train_vectors = count_vectorizer.fit_transform(train_df['text'])
test_vectors = count_vectorizer.transform(test_df['text'])

## Our model

In [23]:
# Ridge classifier created with no arguments, i.e. the library's default settings.
clf = RidgeClassifier()

In [24]:
# 3-fold cross-validation of the classifier on the training vectors, scoring
# each fold with F1; the array of per-fold scores is displayed as the cell output.
scores = cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=train_df['target'],
    scoring='f1',
    cv=3,
)
scores

array([0.60355649, 0.57484457, 0.64485082])

In [25]:
# Train the classifier on the full training matrix and its target labels.
clf.fit(train_vectors, train_df['target'])

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [26]:
# Load the sample submission file; its `target` column is overwritten with
# model predictions further down.
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

In [27]:
# Preview the submission frame as loaded, before predictions are written in.
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [28]:
# The predictions are written into the frame by position, so the submission
# rows must list the same ids, in the same order, as the test rows that
# test_vectors was built from. Fail loudly if they do not line up.
assert np.array_equal(sample_submission['id'].values, test_df['id'].values), \
    "sample_submission ids do not line up with test_df ids"

# Replace the `target` column with the classifier's predicted labels.
sample_submission['target'] = clf.predict(test_vectors)

In [29]:
# Preview the submission again now that `target` holds the predictions.
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [None]:
# Write the submission to CSV; index=False keeps the DataFrame index out of the file.
sample_submission.to_csv("TN_001_submission.csv", index=False)