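This notebook works on the AIcrowd AI Blitz XII *Programming Language Classification* challenge; the problem page with community notebooks is [here](https://www.aicrowd.com/challenges/ai-blitz-xii/problems/programming-language-classification/notebooks).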

In [None]:
# Install the AIcrowd command-line client and load its IPython extension,
# which provides the %aicrowd magic used in the cells below.
!pip install aicrowd-cli
%load_ext aicrowd.magic

In [None]:
# Log in to AIcrowd so the dataset download and submission commands below can run.
%aicrowd login

In [None]:
# Start from an empty data/ directory, then download the dataset of the
# programming-language-classification challenge into it.
!rm -rf data
!mkdir data
%aicrowd ds dl -c programming-language-classification -o data

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sn

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score,accuracy_score,f1_score

# Show scikit-learn estimators as diagrams when they are displayed in the notebook.
from sklearn import set_config
set_config(display="diagram")

# Default size (width, height in inches) of every matplotlib figure created below.
plt.rcParams["figure.figsize"] = (15,6)

In [None]:
# Read the training and test CSV files that were downloaded into data/.
trainData, testData = map(pd.read_csv, ("data/train.csv", "data/test.csv"))

In [None]:
# Preview the first five rows of the training data.
trainData.head()

In [None]:
# Preview the first five rows of the test data.
testData.head()

In [None]:
# Plot how many training samples exist per programming language.
# The column is passed explicitly as `x=`: from seaborn 0.12 on, `data` is the only
# argument accepted positionally, so a bare positional Series is no longer read as
# the x variable (seaborn 0.11 already warned about this).
sn.countplot(x=trainData["language"])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map every language name to an integer id between 0 and n_classes - 1.
# The fitted encoder is kept in LE so that predicted ids can later be
# translated back into language names.
LE = LabelEncoder()
trainData["target"] = LE.fit_transform(trainData["language"])

In [None]:
# Preview the training data again, now including the encoded "target" column.
trainData.head()

In [None]:
# Split the labelled data into training (70%), validation (15%) and hold-out
# test (15%) sets: first cut off 30%, then halve that remainder.
# Shuffling is left at its default (enabled). With shuffle=False the random_state
# argument was silently ignored and the three sets were consecutive slices of the
# file, so any ordering in train.csv would leak into the class balance of each split.
XTrain, XComb, YTrain, YComb = train_test_split(
    trainData["code"], trainData["target"], test_size=0.3, random_state=0
)
print(len(XTrain))
XValidation, XTest, YValidation, YTest = train_test_split(
    XComb, YComb, test_size=0.5, random_state=0
)


In [None]:
# Shapes of the six splits: X train/validation/test, followed by Y train/validation/test.
tuple(part.shape for part in (XTrain, XValidation, XTest, YTrain, YValidation, YTest))

In [None]:
# Bag-of-words text classifier, assembled as a three-step pipeline:
#   vect  - CountVectorizer turns each code snippet into a vector of token counts
#   tfidf - TfidfTransformer re-weights those counts into tf-idf features
#   clf   - MultinomialNB, the naive Bayes variant for multinomially distributed features
pipelineSteps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
# fit() returns the fitted pipeline itself, so it can be bound directly.
classifier = Pipeline(pipelineSteps).fit(XTrain, YTrain)

In [None]:
# Display the architecture of the fitted classifier pipeline.
classifier

In [None]:
# Evaluate on the validation split.
# F1 (macro-averaged here) is, per the original author's note, the metric AIcrowd grades with;
# accuracy = number of correct predictions / total number of predictions, shown in percent.
# The predictions are computed once and reused for both metrics instead of
# running the pipeline over the validation set twice.
validationPredictions = classifier.predict(XValidation)
print("F1:", f1_score(YValidation, validationPredictions, average='macro'))
print("Accuracy:", accuracy_score(YValidation, validationPredictions)*100)

In [None]:
# Evaluate on the hold-out test split.
# F1 (macro-averaged here) is, per the original author's note, the metric AIcrowd grades with;
# accuracy = number of correct predictions / total number of predictions, shown in percent.
# The predictions are computed once and reused for both metrics instead of
# running the pipeline over the test split twice.
testPredictions = classifier.predict(XTest)
print("F1:", f1_score(YTest, testPredictions, average='macro'))
print("Accuracy:", accuracy_score(YTest, testPredictions)*100)

In [None]:
# Dimensions (rows, columns) of the test data.
testData.shape

In [None]:
# Run the fitted pipeline on the code snippets of the test data and
# store the predicted class ids in a new "target" column.
predictedTargets = classifier.predict(testData["code"])
testData["target"] = predictedTargets

In [None]:
# Preview the test data, now including the predicted "target" column.
testData.head()

In [None]:
# Translate the predicted class ids back into the original language names
# with the fitted label encoder; this column is the end result used for the submission.
predictedLanguages = LE.inverse_transform(testData["target"])
testData["prediction"] = predictedLanguages

In [None]:
# Shuffle the rows of the test data (frac=1 returns every row, in random order).
# random_state is fixed, as in the other randomised steps of this notebook, so
# re-running the cell reproduces the same row order in the submission file.
testData = testData.sample(frac=1, random_state=0)
testData.head()

In [None]:
# Recreate an empty assets/ directory and write the predictions to assets/submission.csv.
# NOTE(review): to_csv runs with its defaults, so the DataFrame index and every column
# of testData are written — confirm this matches the expected submission format.
!rm -rf assets
!mkdir assets
testData.to_csv(os.path.join("assets","submission.csv"))

In [None]:
# Submit this notebook together with the assets/ directory to the challenge.
# NOTE(review): --no-verify presumably skips the CLI's pre-submission check — confirm in the aicrowd-cli docs.
%aicrowd notebook submit -c programming-language-classification -a assets --no-verify