In [112]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [113]:
df = pd.read_csv("/content/train.csv")
df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27476,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27478,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


**Drop "neutral" values from the dataset**

In [114]:
df = df.replace({'sentiment': ['neutral']}, np.nan)

# Drop the rows that contain np.nan
df = df.dropna()

In [115]:
df_negative = df['sentiment'] == 'negative'
negative_tweets = df.loc[df_negative]
negative_tweets

Unnamed: 0,textID,text,selected_text,sentiment
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
12,74a76f6e0a,My Sharpie is running DANGERously low on ink,DANGERously,negative
...,...,...,...,...
27463,e7eecffdc8,rec game....trying not to cry...the pain is to...,breakingg,negative
27469,778184dff1,lol i know and haha..did you fall asleep?? o...,t bored,negative
27471,8f5adc47ec,http://twitpic.com/663vr - Wanted to visit the...,were too late,negative
27475,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative


In [116]:
df_positive = df['sentiment'] == 'positive'
positive_tweets = df.loc[df_positive]
positive_tweets

Unnamed: 0,textID,text,selected_text,sentiment
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive
9,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive
11,16fab9f95b,I really really like the song Love Story by Ta...,like,positive
21,e48b0b8a23,Playing Ghost Online is really interesting. Th...,interesting.,positive
25,e00c6ef376,"the free fillin` app on my ipod is fun, im add...","the free fillin` app on my ipod is fun, im add...",positive
...,...,...,...,...
27466,432e6de6c9,morning twit-friends! welcome to my new followers,welcome,positive
27473,8f14bb2715,So I get up early and I feel good about the da...,I feel good ab,positive
27474,b78ec00df5,enjoy ur night,enjoy,positive
27477,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive


 **What is the percentage of positive/negative tweets?**

In [117]:
# Calculate the percentage of positive tweets
percent_positive = (len(positive_tweets) / len(df))*100

# Calculate the percentage of negative tweets
percent_negative = (len(negative_tweets) / len(df))*100

print("The percentage of positive tweets:",percent_positive)
print("The percentage of negative tweets:",percent_negative)

The percentage of positive tweets: 52.44759518425717
The percentage of negative tweets: 47.55240481574283


**Apply a train test split**

In [118]:
X = df['text']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=32, train_size = 0.75)

**Create a vectorizer model with scikit-learn using the Countvectorizer method.**

In [119]:
vectorizer = CountVectorizer()

# Train the model on X_train
X_train_CV = vectorizer.fit_transform(X_train)

# Create the X_test_CV matrix without re-training the model
X_test_CV = vectorizer.transform(X_test)

X_train_CV

<12272x15806 sparse matrix of type '<class 'numpy.int64'>'
	with 144578 stored elements in Compressed Sparse Row format>

**Train a logistic regression**

In [120]:
# Here we train the model only on the train dataset.
model = LogisticRegression()

# Train the model on the training data
model.fit(X_train_CV, y_train)


# And now we compare both scores :
print("\nScore for the Train dataset :", model.score(X_train_CV, y_train))
print("Score for the Test dataset :", model.score(X_test_CV, y_test))


Score for the Train dataset : 0.9663461538461539
Score for the Test dataset : 0.8772916157418724


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**Display 10 tweets that were badly predicted (false positive or false negative).**

In [124]:
predictions = model.predict(X_test_CV)

# Get the indices of the incorrectly predicted tweets
y_testArray = y_test.to_numpy()
X_testArray = X_test.to_numpy()

incorrect_indices = [i for i in range(len(y_test)) if y_testArray[i] != predictions[i]]

# Print the tweets that were incorrectly predicted
for i in incorrect_indices[:10]:
  print(f'Tweet: {X_testArray[i]}')
  print(f'Prediction: {predictions[i]}')
  print(f'Actual: {y_testArray[i]}')
  print()

Tweet: HollowbabesHere comes the utter shite #bgt <I completely agree
Prediction: positive
Actual: negative

Tweet:  SUFFICATION NO BREATHING. It`s okay. There`ll be more. You`re invited to mine, but I can`t promise fun times.  *Jinx
Prediction: positive
Actual: negative

Tweet: i wanna vote for Miley Cyrus for the mtv movie awards..but i don`t know where i could  somebody could send me a link? thaank you <3
Prediction: negative
Actual: positive

Tweet: I love music so much that i`ve gone through pain to play :S my sides of my fingers now are peeling and have blisters from playing so much
Prediction: positive
Actual: negative

Tweet: I can only message those who message me, if we`re fwends...so those that want replies..follow me.  hmm..that sounds funny..
Prediction: negative
Actual: positive

Tweet: wish I could feel no pain (8)  but it`s ok, at least they like Brazil!
Prediction: negative
Actual: positive

Tweet:  so glad i`m not at uni anymore
Prediction: negative
Actual: positive

